From 9cd0798d39840b2b23545bfd5a55788ccaddcb76 Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Sat, 17 Dec 2016 03:15:43 +0300
Subject: [PATCH 01/37] Separate percentile numbers from percentile values.

TfwPrcntl{} structure is removed. All data related to percentiles is
flattened into TfwPrcntlStats{}. That makes it easier to maintain a
single (or identical) copy of percentile numbers, as well as copy just
the percentile values on requests. Also, it looks better with
unnecessary hierarchy removed.
---
 tempesta_fw/apm.c    | 94 ++++++++++++++++++++++----------------------
 tempesta_fw/apm.h    | 42 ++++++++++----------
 tempesta_fw/procfs.c | 36 ++++++++---------
 3 files changed, 82 insertions(+), 90 deletions(-)

diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c
index 6354e6d88..046a25b77 100644
--- a/tempesta_fw/apm.c
+++ b/tempesta_fw/apm.c
@@ -289,14 +289,15 @@ static inline bool
 tfw_stats_adj_max(TfwPcntRanges *rng, unsigned int r_time)
 {
 	int old_val, max_val = atomic_read(&rng->max_val);
-	while (1) {
-		if (r_time <= max_val)
-			return false;
+
+	while (r_time > max_val) {
 		old_val = atomic_cmpxchg(&rng->max_val, max_val, r_time);
 		if (likely(old_val == max_val))
 			return true;
 		max_val = old_val;
 	}
+
+	return false;
 }
 
 /*
@@ -308,14 +309,15 @@ static inline bool
 tfw_stats_adj_min(TfwPcntRanges *rng, unsigned int r_time)
 {
 	int old_val, min_val = atomic_read(&rng->min_val);
-	while (1) {
-		if (r_time >= min_val)
-			return false;
+
+	while (r_time < min_val) {
 		old_val = atomic_cmpxchg(&rng->min_val, min_val, r_time);
 		if (likely(old_val == min_val))
 			return true;
 		min_val = old_val;
 	}
+
+	return false;
 }
 
 /**
@@ -440,11 +442,11 @@ typedef struct {
  * of the stored values. The stored values of the latest percentiles are
  * a shared resource that needs a lock to access. An array of two entries
  * is used to decrease the lock contention. Readers read the stored values
- * at @prcntl[@rdidx % 2]. The writer writes the new percentile values to
- * @prcntl[(@rdidx + 1) % 2], and then increments @rdidx. The reading and
+ * at @asent[@rdidx % 2]. The writer writes the new percentile values to
+ * @asent[(@rdidx + 1) % 2], and then increments @rdidx. The reading and
  * the writing are protected by a rwlock.
  * @asent	- The stats entries for reading/writing (flip-flop manner).
- * @rdidx	- The current index in @prcntl for readers.
+ * @rdidx	- The current index in @asent for readers.
  */
 typedef struct {
 	TfwApmSEnt	asent[2];
@@ -487,11 +489,6 @@ static const TfwPcntCtl __read_mostly tfw_rngctl_init[TFW_STATS_RANGES] = {
 	{{4, 109, 349}}
 };
 
-/* A superset of percentiles for all users. */
-static const TfwPrcntl __read_mostly tfw_apm_prcntl[] = {
-	{50}, {75}, {90}, {95}, {99}
-};
-
 static int tfw_apm_jtmwindow;	/* Time window in jiffies. */
 static int tfw_apm_jtmintrvl;	/* Time interval in jiffies. */
 static int tfw_apm_tmwscale;	/* Time window scale. */
@@ -571,7 +568,7 @@ static int
 tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats)
 {
 	int i, p;
-	unsigned long cnt = 0, val, pval[pstats->prcntlsz];
+	unsigned long cnt = 0, val, pval[pstats->psz];
 	TfwApmRBEState st[rbuf->rbufsz];
 	TfwPcntRanges *pcntrng;
 	TfwApmRBEnt *rbent = rbuf->rbent;
@@ -582,12 +579,12 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats
 		__tfw_apm_state_next(pcntrng, &st[i]);
 	}
 	/* The number of items to collect for each percentile. */
-	for (i = 0, p = 0; i < pstats->prcntlsz; ++i) {
-		pval[i] = rbctl->total_cnt * pstats->prcntl[i].ith / 100;
+	for (i = p = 0; i < pstats->psz; ++i) {
+		pval[i] = rbctl->total_cnt * pstats->ith[i] / 100;
 		if (!pval[i])
-			pstats->prcntl[p++].val = 0;
+			pstats->val[p++] = 0;
 	}
-	while (p < pstats->prcntlsz) {
+	while (p < pstats->psz) {
 		int v_min = USHRT_MAX;
 		for (i = 0; i < rbuf->rbufsz; i++) {
 			if (st[i].v < v_min)
@@ -613,8 +610,8 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats
 			cnt += atomic_read(&pcntrng->cnt[st[i].r][st[i].b]);
 			tfw_apm_state_next(pcntrng, &st[i]);
 		}
-		for ( ; p < pstats->prcntlsz && pval[p] <= cnt; ++p)
-			pstats->prcntl[p].val = v_min;
+		for ( ; p < pstats->psz && pval[p] <= cnt; ++p)
+			pstats->val[p] = v_min;
 	}
 	cnt = val = 0;
 	pstats->max = 0;
@@ -774,12 +771,14 @@ static void
 tfw_apm_calc(TfwApmData *data)
 {
 	int nfilled, wridx, recalc;
-	TfwPrcntl prcntl[ARRAY_SIZE(tfw_apm_prcntl)];
-	TfwPrcntlStats pstats = { prcntl, ARRAY_SIZE(prcntl) };
+	unsigned int val[ARRAY_SIZE(tfw_pstats_ith)];
+	TfwPrcntlStats pstats = {
+		.ith = tfw_pstats_ith,
+		.val = val,
+		.psz = ARRAY_SIZE(tfw_pstats_ith)
+	};
 	TfwApmSEnt *asent;
 
-	memcpy(prcntl, tfw_apm_prcntl, sizeof(tfw_apm_prcntl));
-
 	wridx = ((unsigned int)atomic_read(&data->stats.rdidx) + 1) % 2;
 	asent = &data->stats.asent[wridx];
 
@@ -788,14 +787,14 @@ tfw_apm_calc(TfwApmData *data)
 	if (!nfilled)
 		return;
 
-	if (nfilled < asent->pstats.prcntlsz) {
+	if (nfilled < asent->pstats.psz) {
 		TFW_DBG3("%s: Percentile calculation incomplete.\n", __func__);
 		set_bit(TFW_APM_DATA_F_RECALC, &data->flags);
 	} else {
 		TFW_DBG3("%s: Percentile values may have changed.\n", __func__);
 		write_lock(&asent->rwlock);
-		memcpy(asent->pstats.prcntl, prcntl,
-		       asent->pstats.prcntlsz * sizeof(TfwPrcntl));
+		memcpy(asent->pstats.val, pstats.val,
+		       asent->pstats.psz * sizeof(asent->pstats.val[0]));
 		asent->pstats.min = pstats.min;
 		asent->pstats.max = pstats.max;
 		asent->pstats.avg = pstats.avg;
@@ -809,7 +808,7 @@
  * Runs periodically on timer.
  */
 static void
-tfw_apm_prcntl_fn(unsigned long fndata)
+tfw_apm_pstats_fn(unsigned long fndata)
 {
 	TfwApmData *data = (TfwApmData *)fndata;
@@ -842,8 +841,8 @@ tfw_apm_prcntl_fn(unsigned long fndata)
 	asent = &data->stats.asent[rdidx];				\
 									\
 	fn_lock(&asent->rwlock);					\
-	memcpy(pstats->prcntl, asent->pstats.prcntl,			\
-	       pstats->prcntlsz * sizeof(TfwPrcntl));			\
+	memcpy(pstats->val, asent->pstats.val,				\
+	       pstats->psz * sizeof(pstats->val[0]));			\
 	pstats->min = asent->pstats.min;				\
 	pstats->max = asent->pstats.max;				\
 	pstats->avg = asent->pstats.avg;				\
@@ -871,14 +870,14 @@ tfw_apm_stats(void *apmdata, TfwPrcntlStats *pstats)
 * All APM Stats users must use the same set of percentiles.
 */
 int
-tfw_apm_prcntl_verify(TfwPrcntl *prcntl, unsigned int prcntlsz)
+tfw_apm_pstats_verify(TfwPrcntlStats *pstats)
 {
 	int i;
 
-	if (prcntlsz != ARRAY_SIZE(tfw_apm_prcntl))
+	if (pstats->psz != ARRAY_SIZE(tfw_pstats_ith))
 		return 1;
-	for (i = 0; i < prcntlsz; ++i)
-		if (prcntl[i].ith != tfw_apm_prcntl[i].ith)
+	for (i = 0; i < pstats->psz; ++i)
+		if (pstats->ith[i] != tfw_pstats_ith[i])
 			return 1;
 	return 0;
 }
@@ -944,10 +943,10 @@ tfw_apm_create(void)
 {
 	TfwApmData *data;
 	TfwApmRBEnt *rbent;
-	TfwPrcntl *prcntl[2];
 	int i, size;
+	unsigned int *val[2];
 	int rbufsz = tfw_apm_tmwscale;
-	int prcntlsz = ARRAY_SIZE(tfw_apm_prcntl);
+	int psz = ARRAY_SIZE(tfw_pstats_ith);
 
 	if (!tfw_apm_tmwscale) {
 		TFW_ERR("Late initialization of 'apm_stats' option\n");
@@ -956,39 +955,38 @@ tfw_apm_create(void)
 	/* Keep complete stats for the full time window. */
 	size = sizeof(TfwApmData) + rbufsz * sizeof(TfwApmRBEnt)
-	       + 2 * sizeof(tfw_apm_prcntl);
+	       + 2 * psz * sizeof(unsigned int);
 	if ((data = kzalloc(size, GFP_ATOMIC)) == NULL)
 		return NULL;
 
 	/* Set up memory areas. */
 	rbent = (TfwApmRBEnt *)(data + 1);
-	prcntl[0] = (TfwPrcntl *)(rbent + rbufsz);
-	prcntl[1] = (TfwPrcntl *)(prcntl[0] + prcntlsz);
+	val[0] = (unsigned int *)(rbent + rbufsz);
+	val[1] = (unsigned int *)(val[0] + psz);
 
 	data->rbuf.rbent = rbent;
 	data->rbuf.rbufsz = rbufsz;
 
-	data->stats.asent[0].pstats.prcntl = prcntl[0];
-	data->stats.asent[0].pstats.prcntlsz = prcntlsz;
+	data->stats.asent[0].pstats.ith = tfw_pstats_ith;
+	data->stats.asent[0].pstats.val = val[0];
+	data->stats.asent[0].pstats.psz = psz;
 
-	data->stats.asent[1].pstats.prcntl = prcntl[1];
-	data->stats.asent[1].pstats.prcntlsz = prcntlsz;
+	data->stats.asent[1].pstats.ith = tfw_pstats_ith;
+	data->stats.asent[1].pstats.val = val[1];
+	data->stats.asent[1].pstats.psz = psz;
 
 	/* Initialize data. */
 	for (i = 0; i < rbufsz; ++i)
 		tfw_apm_rbent_init(&rbent[i], 0);
 	spin_lock_init(&data->rbuf.slock);
 
-	memcpy(prcntl[0], tfw_apm_prcntl, sizeof(tfw_apm_prcntl));
-	memcpy(prcntl[1], tfw_apm_prcntl, sizeof(tfw_apm_prcntl));
-
 	rwlock_init(&data->stats.asent[0].rwlock);
 	rwlock_init(&data->stats.asent[1].rwlock);
 	atomic_set(&data->stats.rdidx, 0);
 
 	/* Start the timer for the percentile calculation. */
 	set_bit(TFW_APM_DATA_F_REARM, &data->flags);
-	setup_timer(&data->timer, tfw_apm_prcntl_fn, (unsigned long)data);
+	setup_timer(&data->timer, tfw_apm_pstats_fn, (unsigned long)data);
 	mod_timer(&data->timer, jiffies + TFW_APM_TIMER_TIMEOUT);
 
 	return data;
diff --git a/tempesta_fw/apm.h b/tempesta_fw/apm.h
index f5ca5fbbe..d4dbe1713 100644
--- a/tempesta_fw/apm.h
+++ b/tempesta_fw/apm.h
@@ -23,36 +23,34 @@
 #include "pool.h"
 
 /*
- * @ith	- percentile number.
- * @val	- percentile value.
+ * @ith	- array of percentile numbers;
+ * @val	- array of percentile values;
+ * @psz	- size of @ith and @val arrays;
+ * @min	- minimal value;
+ * @max	- maximal value;
+ * @avg	- average value;
+ * @seq	- opaque data related to percentiles calculation;
 */
 typedef struct {
-	unsigned int	ith;
-	unsigned int	val;
-} TfwPrcntl;
-
-/*
- * @stats	- Percentile Stats array.
- * @stsz	- @stats array size.
- * @min		- Minimal value.
- * @max		- Maximal value.
- * @avg		- Average value.
- * @seq		- opaque data related to percentiles calculation.
- */
-typedef struct {
-	TfwPrcntl	*prcntl;
-	unsigned int	prcntlsz;
-	unsigned int	min;
-	unsigned int	max;
-	unsigned int	avg;
-	unsigned int	seq;
+	const unsigned int	*ith;
+	unsigned int		*val;
+	unsigned int		psz;
+	unsigned int		min;
+	unsigned int		max;
+	unsigned int		avg;
+	unsigned int		seq;
 } TfwPrcntlStats;
 
+/* A superset of percentiles for all users. */
+static const unsigned int __read_mostly tfw_pstats_ith[] = {
+	50, 75, 90, 95, 99
+};
+
 void *tfw_apm_create(void);
 void tfw_apm_destroy(void *data);
 void tfw_apm_update(void *data, unsigned long jtstamp, unsigned long jrtime);
 int tfw_apm_stats(void *data, TfwPrcntlStats *pstats);
 int tfw_apm_stats_bh(void *data, TfwPrcntlStats *pstats);
-int tfw_apm_prcntl_verify(TfwPrcntl *prcntl, unsigned int prcntlsz);
+int tfw_apm_pstats_verify(TfwPrcntlStats *pstats);
 
 #endif /* __TFW_APM_H__ */
diff --git a/tempesta_fw/procfs.c b/tempesta_fw/procfs.c
index 29d1ad4fa..405b9b76e 100644
--- a/tempesta_fw/procfs.c
+++ b/tempesta_fw/procfs.c
@@ -141,14 +141,6 @@ tfw_perfstat_seq_open(struct inode *inode, struct file *file)
 	return single_open(file, tfw_perfstat_seq_show, PDE_DATA(inode));
 }
 
-/*
- * Individual server statistics. Note that 50% percentile
- * is used to tell the median value.
- */
-static const TfwPrcntl __read_mostly tfw_procfs_prcntl[] = {
-	{50}, {75}, {90}, {95}, {99}
-};
-
 static int
 tfw_srvstats_seq_show(struct seq_file *seq, void *off)
 {
@@ -157,21 +149,24 @@ tfw_srvstats_seq_show(struct seq_file *seq, void *off)
 	int i;
 	TfwSrvConn *srv_conn;
 	TfwServer *srv = seq->private;
-	TfwPrcntl prcntl[ARRAY_SIZE(tfw_procfs_prcntl)];
-	TfwPrcntlStats pstats = { prcntl, ARRAY_SIZE(prcntl) };
-
-	memcpy(prcntl, tfw_procfs_prcntl, sizeof(prcntl));
+	unsigned int val[ARRAY_SIZE(tfw_pstats_ith)];
+	TfwPrcntlStats pstats = {
+		.ith = tfw_pstats_ith,
+		.val = val,
+		.psz = ARRAY_SIZE(tfw_pstats_ith)
+	};
 
 	tfw_apm_stats_bh(srv->apm, &pstats);
 
 	SPRNE("Minimal response time\t\t", pstats.min);
 	SPRNE("Average response time\t\t", pstats.avg);
-	SPRNE("Median response time\t\t", prcntl[0].val);
+	SPRNE("Median response time\t\t", pstats.val[0]);
 	SPRNE("Maximum response time\t\t", pstats.max);
+
 	seq_printf(seq, "Percentiles\n");
-	for (i = 0; i < ARRAY_SIZE(prcntl); ++i)
-		seq_printf(seq, "\t%02d%%:\t%dms\n",
-			   prcntl[i].ith, prcntl[i].val);
+	for (i = 0; i < ARRAY_SIZE(tfw_pstats_ith); ++i)
+		seq_printf(seq, "%02d%%:\t%dms\n",
+			   pstats.ith[i], pstats.val[i]);
 	i = 0;
 	seq_printf(seq, "Maximum forwarding queue size\t: %d\n",
 		   srv->sg->max_qsize);
@@ -241,13 +236,14 @@ static int
 tfw_procfs_cfg_start(void)
 {
 	int i, ret;
-	TfwPrcntl prcntl[ARRAY_SIZE(tfw_procfs_prcntl)];
-
-	memcpy(prcntl, tfw_procfs_prcntl, sizeof(prcntl));
+	TfwPrcntlStats pstats = {
+		.ith = tfw_pstats_ith,
+		.psz = ARRAY_SIZE(tfw_pstats_ith)
+	};
 
 	if (!tfw_procfs_tempesta)
 		return -ENOENT;
-	if (tfw_apm_prcntl_verify(prcntl, ARRAY_SIZE(prcntl)))
+	if (tfw_apm_pstats_verify(&pstats))
 		return -EINVAL;
 	tfw_procfs_srvstats = proc_mkdir("servers", tfw_procfs_tempesta);
 	if (!tfw_procfs_srvstats)

From 6423d3f9c8531e229f028bdf02ec0e3fe036b682 Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Sat, 17 Dec 2016 03:28:21 +0300
Subject: [PATCH 02/37] Remove the possibility of negative array indices.
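
The scheduler counters are atomic64_t, while the local index variable
was declared plain unsigned long. A minimal user-space sketch of the
hazard, assuming the counter is taken modulo the number of servers
(illustrative names only, not Tempesta code): narrowing the 64-bit
counter into a signed type before the modulo can yield a negative
index, which a uint64_t value kept end to end never does.

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* A counter value past INT64_MAX. */
		uint64_t counter = UINT64_C(0x8000000000000001);
		uint64_t srv_n = 3;

		/* Hypothetical narrowing to a signed type. */
		long long bad = (long long)counter;
		printf("signed index: %lld\n", bad % (long long)srv_n);

		/* uint64_t arithmetic: index is always 0..srv_n-1. */
		printf("unsigned index: %llu\n",
		       (unsigned long long)(counter % srv_n));
		return 0;
	}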
---
 tempesta_fw/sched/tfw_sched_rr.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tempesta_fw/sched/tfw_sched_rr.c b/tempesta_fw/sched/tfw_sched_rr.c
index 538749f50..639a2da84 100644
--- a/tempesta_fw/sched/tfw_sched_rr.c
+++ b/tempesta_fw/sched/tfw_sched_rr.c
@@ -32,8 +32,8 @@ MODULE_LICENSE("GPL");
 
 /**
  * List of connections to an upstream server.
- * Connections can up and down during failover process and shouldn't be
- * taken into account by the scheduler.
+ * Connections can go up and down during failover process. Only
+ * fully established connections are considered by the scheduler.
  */
 typedef struct {
 	atomic64_t	rr_counter;
@@ -44,8 +44,9 @@ typedef struct {
 
 /**
  * List of upstream servers.
- * The list is considered static, i.e. all the servers are alive during
- * whole run-time. This can be changed in future.
+ * The list is considered static, i.e. all servers, either dead
+ * or alive, are present in the list during the whole run-time.
+ * That may change in the future.
  */
 typedef struct {
 	atomic64_t	rr_counter;
@@ -67,8 +68,8 @@ tfw_sched_rr_free_data(TfwSrvGroup *sg)
 }
 
 /**
- * Add connection and server, if new, to the scheduler.
- * Called at configuration phase, no synchronization is required.
+ * Add a connection and a server, if new, to the scheduler.
+ * Called at configuration stage, no synchronization is required.
 */
 static void
 tfw_sched_rr_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn)
@@ -121,7 +122,7 @@ static TfwSrvConn *
 tfw_sched_rr_get_srv_conn(TfwMsg *msg, TfwSrvGroup *sg)
 {
 	size_t c, s;
-	unsigned long idxval;
+	uint64_t idxval;
 	int skipnip = 1, nipconn = 0;
 	TfwRrSrvList *sl = sg->sched_data;
 	TfwRrSrv *srv_cl;

From 2951467d2f9b7801c6b5fca0ea93b09a15c6a6f6 Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Mon, 19 Dec 2016 00:23:54 +0300
Subject: [PATCH 03/37] Put all APM stats together in the same indexed array.

Minimum, maximum, and average values are now together with percentile
values in the same indexed array. That makes it possible to specify
an index of the value to use in scheduling.
---
 tempesta_fw/apm.c    | 40 +++++++++++++++++++++++-----------------
 tempesta_fw/apm.h    | 30 ++++++++++++++++++++----------
 tempesta_fw/procfs.c | 12 ++++++------
 3 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c
index 046a25b77..d9946a953 100644
--- a/tempesta_fw/apm.c
+++ b/tempesta_fw/apm.c
@@ -567,6 +567,11 @@ tfw_apm_state_next(TfwPcntRanges *rng, TfwApmRBEState *st)
 static int
 tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats)
 {
+#define IDX_MIN	TFW_PSTATS_IDX_MIN
+#define IDX_MAX	TFW_PSTATS_IDX_MAX
+#define IDX_AVG	TFW_PSTATS_IDX_AVG
+#define IDX_ITH	TFW_PSTATS_IDX_ITH
+
 	int i, p;
 	unsigned long cnt = 0, val, pval[pstats->psz];
 	TfwApmRBEState st[rbuf->rbufsz];
 	TfwPcntRanges *pcntrng;
 	TfwApmRBEnt *rbent = rbuf->rbent;
@@ -584,7 +589,7 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats
 		__tfw_apm_state_next(pcntrng, &st[i]);
 	}
 	/* The number of items to collect for each percentile. */
-	for (i = p = 0; i < pstats->psz; ++i) {
+	for (i = p = IDX_ITH; i < pstats->psz; ++i) {
 		pval[i] = rbctl->total_cnt * pstats->ith[i] / 100;
 		if (!pval[i])
 			pstats->val[p++] = 0;
@@ -599,8 +604,10 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats
 			 "cnt [%lu] total_cnt [%lu]\n",
 			 __func__, cnt, rbctl->total_cnt);
 		TFW_DBG3("%s: [%lu] [%lu] [%lu] [%lu] [%lu] [%lu]\n",
-			 __func__, pval[0], pval[1], pval[2],
-			 pval[3], pval[4], pval[5]);
+			 __func__,
+			 pval[IDX_ITH], pval[IDX_ITH + 1],
+			 pval[IDX_ITH + 2], pval[IDX_ITH + 3],
+			 pval[IDX_ITH + 4], pval[IDX_ITH + 5]);
 		break;
 	}
 	for (i = 0; i < rbuf->rbufsz; i++) {
@@ -614,21 +621,26 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats
 		pstats->val[p] = v_min;
 	}
 	cnt = val = 0;
-	pstats->max = 0;
-	pstats->min = UINT_MAX;
+	pstats->val[IDX_MAX] = 0;
+	pstats->val[IDX_MIN] = UINT_MAX;
 	for (i = 0; i < rbuf->rbufsz; i++) {
 		pcntrng = &rbent[i].pcntrng;
-		if (pstats->min > atomic_read(&pcntrng->min_val))
-			pstats->min = atomic_read(&pcntrng->min_val);
-		if (pstats->max < atomic_read(&pcntrng->max_val))
-			pstats->max = atomic_read(&pcntrng->max_val);
+		if (pstats->val[IDX_MIN] > atomic_read(&pcntrng->min_val))
+			pstats->val[IDX_MIN] = atomic_read(&pcntrng->min_val);
+		if (pstats->val[IDX_MAX] < atomic_read(&pcntrng->max_val))
+			pstats->val[IDX_MAX] = atomic_read(&pcntrng->max_val);
 		cnt += atomic64_read(&pcntrng->tot_cnt);
 		val += atomic64_read(&pcntrng->tot_val);
 	}
 	if (likely(cnt))
-		pstats->avg = val / cnt;
+		pstats->val[IDX_AVG] = val / cnt;
 
 	return p;
+
+#undef IDX_ITH
+#undef IDX_AVG
+#undef IDX_MAX
+#undef IDX_MIN
 }
 
 /*
@@ -771,7 +783,7 @@ static void
 tfw_apm_calc(TfwApmData *data)
 {
 	int nfilled, wridx, recalc;
-	unsigned int val[ARRAY_SIZE(tfw_pstats_ith)];
+	unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 };
 	TfwPrcntlStats pstats = {
 		.ith = tfw_pstats_ith,
 		.val = val,
@@ -795,9 +807,6 @@ tfw_apm_calc(TfwApmData *data)
 		write_lock(&asent->rwlock);
 		memcpy(asent->pstats.val, pstats.val,
 		       asent->pstats.psz * sizeof(asent->pstats.val[0]));
-		asent->pstats.min = pstats.min;
-		asent->pstats.max = pstats.max;
-		asent->pstats.avg = pstats.avg;
 		atomic_inc(&data->stats.rdidx);
 		write_unlock(&asent->rwlock);
 	}
@@ -843,9 +852,6 @@ tfw_apm_pstats_fn(unsigned long fndata)
 	fn_lock(&asent->rwlock);					\
 	memcpy(pstats->val, asent->pstats.val,				\
 	       pstats->psz * sizeof(pstats->val[0]));			\
-	pstats->min = asent->pstats.min;				\
-	pstats->max = asent->pstats.max;				\
-	pstats->avg = asent->pstats.avg;				\
 	fn_unlock(&asent->rwlock);					\
 	pstats->seq = rdidx;						\
 									\
diff --git a/tempesta_fw/apm.h b/tempesta_fw/apm.h
index d4dbe1713..523a4ad3a 100644
--- a/tempesta_fw/apm.h
+++ b/tempesta_fw/apm.h
@@ -23,27 +23,37 @@
 #include "pool.h"
 
 /*
- * @ith	- array of percentile numbers;
- * @val	- array of percentile values;
+ * @ith	- array of percentile numbers, with space for min/max/avg;
+ * @val	- array of percentile values, and values for min/max/avg;
  * @psz	- size of @ith and @val arrays;
- * @min	- minimal value;
- * @max	- maximal value;
- * @avg	- average value;
  * @seq	- opaque data related to percentiles calculation;
 */
 typedef struct {
 	const unsigned int	*ith;
 	unsigned int		*val;
 	unsigned int		psz;
-	unsigned int		min;
-	unsigned int		max;
-	unsigned int		avg;
 	unsigned int		seq;
 } TfwPrcntlStats;
 
-/* A superset of percentiles for all users. */
+enum {
+	TFW_PSTATS_IDX_MIN = 0,
+	TFW_PSTATS_IDX_MAX,
+	TFW_PSTATS_IDX_AVG,
+	TFW_PSTATS_IDX_ITH,
+	TFW_PSTATS_IDX_P50 = TFW_PSTATS_IDX_ITH,
+	TFW_PSTATS_IDX_P75,
+	TFW_PSTATS_IDX_P90,
+	TFW_PSTATS_IDX_P95,
+	TFW_PSTATS_IDX_P99,
+};
+
 static const unsigned int __read_mostly tfw_pstats_ith[] = {
-	50, 75, 90, 95, 99
+	[TFW_PSTATS_IDX_MIN ... TFW_PSTATS_IDX_AVG] = 0,
+	[TFW_PSTATS_IDX_P50] = 50,
+	[TFW_PSTATS_IDX_P75] = 75,
+	[TFW_PSTATS_IDX_P90] = 90,
+	[TFW_PSTATS_IDX_P95] = 95,
+	[TFW_PSTATS_IDX_P99] = 99,
 };
 
 void *tfw_apm_create(void);
diff --git a/tempesta_fw/procfs.c b/tempesta_fw/procfs.c
index 405b9b76e..8b943c436 100644
--- a/tempesta_fw/procfs.c
+++ b/tempesta_fw/procfs.c
@@ -149,7 +149,7 @@ tfw_srvstats_seq_show(struct seq_file *seq, void *off)
 	int i;
 	TfwSrvConn *srv_conn;
 	TfwServer *srv = seq->private;
-	unsigned int val[ARRAY_SIZE(tfw_pstats_ith)];
+	unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 };
 	TfwPrcntlStats pstats = {
 		.ith = tfw_pstats_ith,
 		.val = val,
@@ -158,13 +158,13 @@ tfw_srvstats_seq_show(struct seq_file *seq, void *off)
 
 	tfw_apm_stats_bh(srv->apm, &pstats);
 
-	SPRNE("Minimal response time\t\t", pstats.min);
-	SPRNE("Average response time\t\t", pstats.avg);
-	SPRNE("Median response time\t\t", pstats.val[0]);
-	SPRNE("Maximum response time\t\t", pstats.max);
+	SPRNE("Minimal response time\t\t", pstats.val[TFW_PSTATS_IDX_MIN]);
+	SPRNE("Average response time\t\t", pstats.val[TFW_PSTATS_IDX_AVG]);
+	SPRNE("Median response time\t\t", pstats.val[TFW_PSTATS_IDX_P50]);
+	SPRNE("Maximum response time\t\t", pstats.val[TFW_PSTATS_IDX_MAX]);
 
 	seq_printf(seq, "Percentiles\n");
-	for (i = 0; i < ARRAY_SIZE(tfw_pstats_ith); ++i)
+	for (i = TFW_PSTATS_IDX_ITH; i < ARRAY_SIZE(tfw_pstats_ith); ++i)
 		seq_printf(seq, "%02d%%:\t%dms\n",
 			   pstats.ith[i], pstats.val[i]);
 	i = 0;

From d2769016f29ac83411a78913eea40b702fb30bd5 Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Tue, 20 Dec 2016 15:40:15 +0300
Subject: [PATCH 04/37] Implement scheduler and server options related to
 dynamic scheduling.

The "server" directive has a new optional argument "weight=".
The "sched" directive has a number of new options: one of "static" or
"dynamic". If "dynamic", then one of "minimum", "maximum", "average",
or "percentile". If "percentile", then a percentile number, one of
those known to Tempesta (currently 50, 75, 90, 95, 99).

The "sched" directive has changed with this patch. There's no more
"sched" option in the "srv_group" directive. Instead, a "sched"
directive may be specified for a server group. Only one "sched"
directive is allowed per group. The same is true for the implicit
group "default". It doesn't matter where the "sched" directive is
specified in the "srv_group" section, or in the file for servers
outside of all groups.

If a "sched" directive is specified at the start of the configuration
file, and a "srv_group" section doesn't have a "sched" directive, then
the values of that outer "sched" directive are propagated to the
server group. If no earlier outer "sched" directive is given, a
default scheduler with default options is used.

Add server connections only when a scheduler is set for a group.
---
 README.md              |   2 +-
 tempesta_fw/apm.h      |   1 +
 tempesta_fw/server.h   |  11 ++-
 tempesta_fw/sock_srv.c | 216 ++++++++++++++++++++++++++++++++---------
 4 files changed, 182 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index 5cd0195f0..655a055c5 100644
--- a/README.md
+++ b/README.md
@@ -465,7 +465,7 @@ with this directive. If not specified, the queue size is set to 1000.
 Back end servers can be grouped together into a single unit for the purpose
 of load balancing. Servers within a group are considered interchangeable.
 The load is distributed evenly among servers within a group.
-If a server goes offline, other servers in a group take the load.
+If a server goes offline, then other servers in a group take the load.
 The full syntax is as follows:
 ```
 srv_group {
diff --git a/tempesta_fw/apm.h b/tempesta_fw/apm.h
index 523a4ad3a..686eb32d1 100644
--- a/tempesta_fw/apm.h
+++ b/tempesta_fw/apm.h
@@ -45,6 +45,7 @@ enum {
 	TFW_PSTATS_IDX_P90,
 	TFW_PSTATS_IDX_P95,
 	TFW_PSTATS_IDX_P99,
+	_TFW_PSTATS_IDX_COUNT
 };
 
 static const unsigned int __read_mostly tfw_pstats_ith[] = {
diff --git a/tempesta_fw/server.h b/tempesta_fw/server.h
index 400f05393..af7090846 100644
--- a/tempesta_fw/server.h
+++ b/tempesta_fw/server.h
@@ -38,6 +38,7 @@ typedef struct tfw_scheduler_t TfwScheduler;
  * @list	- member pointer in the list of servers of a server group;
  * @sg		- back-reference to the server group;
  * @apm		- opaque handle for APM stats;
+ * @weight	- static server weight for load balancers;
  */
 typedef struct {
 	TFW_PEER_COMMON;
@@ -45,6 +46,7 @@ typedef struct {
 	TfwSrvGroup	*sg;
 	void		*apm;
 	int		stress;
+	unsigned char	weight;
 } TfwServer;
 
 /**
@@ -82,12 +84,19 @@ struct tfw_srv_group_t {
 
 /* Server related flags. */
 #define TFW_SRV_RETRY_NIP	0x0001	/* Retry non-idemporent req. */
+/*
+ * Lower 4 bits keep an index into APM stats array.
+ */
+#define TFW_SG_F_PSTATS_IDX_MASK	0x000f
+#define TFW_SG_F_SCHED_RATIO_STATIC	0x0010
+#define TFW_SG_F_SCHED_RATIO_DYNAMIC	0x0020
+#define TFW_SG_F_SCHED_RATIO_PREDICT	0x0040
 
 /**
  * Requests scheduling algorithm handler.
  *
  * @name	- name of the algorithm;
- * @list	- list of registered schedulers;
+ * @list	- member in the list of registered schedulers;
  * @add_grp	- add server group to the scheduler;
  * @del_grp	- delete server group from the scheduler;
  * @add_conn	- add connection and server if it's new, called in process
diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c
index 0f88b3637..bc884a348 100644
--- a/tempesta_fw/sock_srv.c
+++ b/tempesta_fw/sock_srv.c
@@ -25,6 +25,7 @@
 #include
 #include
 
+#include "apm.h"
 #include "tempesta_fw.h"
 #include "connection.h"
 #include "addr.h"
@@ -569,6 +570,9 @@ tfw_sock_srv_delete_all_conns(void)
 #define TFW_CFG_SRV_FWD_RETRIES_DEF	5	/* Default number of tries */
 #define TFW_CFG_SRV_CNS_RETRIES_DEF	10	/* Reconnect tries. */
 #define TFW_CFG_SRV_RETRY_NIP_DEF	0	/* Do NOT resend NIP reqs */
+#define TFW_CFG_SRV_WEIGHT_MIN		1
+#define TFW_CFG_SRV_WEIGHT_MAX		100
+#define TFW_CFG_SRV_WEIGHT_DEF		50
 
 static TfwServer *tfw_cfg_in_slst[TFW_SG_MAX_SRV];
 static TfwServer *tfw_cfg_out_slst[TFW_SG_MAX_SRV];
 static int tfw_cfg_in_nconn[TFW_SG_MAX_SRV];
 static int tfw_cfg_out_nconn[TFW_SG_MAX_SRV];
 static int tfw_cfg_in_slstsz, tfw_cfg_out_slstsz;
 static TfwScheduler *tfw_cfg_in_sched, *tfw_cfg_out_sched;
 static TfwSrvGroup *tfw_cfg_in_sg, *tfw_cfg_out_sg;
@@ -590,6 +594,9 @@ static int tfw_cfg_out_fwd_retries = TFW_CFG_SRV_FWD_RETRIES_DEF;
 static int tfw_cfg_out_cns_retries = TFW_CFG_SRV_CNS_RETRIES_DEF;
 static int tfw_cfg_out_retry_nip = TFW_CFG_SRV_RETRY_NIP_DEF;
 
+static unsigned int tfw_cfg_in_sg_flags = TFW_SG_F_SCHED_RATIO_STATIC;
+static unsigned int tfw_cfg_out_sg_flags = TFW_SG_F_SCHED_RATIO_STATIC;
+
 static int
 tfw_cfgop_intval(TfwCfgSpec *cs, TfwCfgEntry *ce, int *intval)
 {
@@ -706,48 +713,48 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce,
 {
 	TfwAddr addr;
 	TfwServer *srv;
-	int i, conns_n = 0;
-	bool has_conns_n = false;
-	const char *key, *val, *saddr;
+	int i, conns_n = 0, weight = 0;
+	bool has_conns_n = false, has_weight = false;
+	const char *key, *val;
 
 	if (ce->val_n != 1) {
-		TFW_ERR_NL("%s: %s %s: Invalid number of arguments: %zd\n",
-			   sg->name, cs->name, ce->val_n ? ce->vals[0] : "",
-			   ce->val_n);
+		TFW_ERR_NL("Invalid number of arguments: %zd\n", ce->val_n);
 		return -EINVAL;
 	}
 	if (ce->attr_n > 2) {
-		TFW_ERR_NL("%s: %s %s: Invalid number of key=value pairs: %zd\n",
-			   sg->name, cs->name, ce->vals[0], ce->attr_n);
+		TFW_ERR_NL("Invalid number of key=value pairs: %zd\n",
+			   ce->attr_n);
 		return -EINVAL;
 	}
 
-	saddr = ce->vals[0];
-
-	if (tfw_addr_pton(&TFW_STR_FROM(saddr), &addr)) {
-		TFW_ERR_NL("%s: %s %s: Invalid IP address: '%s'\n",
-			   sg->name, cs->name, saddr, saddr);
+	if (tfw_addr_pton(&TFW_STR_FROM(ce->vals[0]), &addr)) {
+		TFW_ERR_NL("Invalid IP address: '%s'\n", ce->vals[0]);
 		return -EINVAL;
 	}
 
 	TFW_CFG_ENTRY_FOR_EACH_ATTR(ce, i, key, val) {
 		if (!strcasecmp(key, "conns_n")) {
 			if (has_conns_n) {
-				TFW_ERR_NL("%s: %s %s: Duplicate arg: '%s=%s'"
-					   "\n", sg->name, cs->name, saddr, key,
-					   val);
+				TFW_ERR_NL("Duplicate arg: '%s'\n", key);
 				return -EINVAL;
 			}
 			if (tfw_cfg_parse_int(val, &conns_n)) {
-				TFW_ERR_NL("%s: %s %s: Invalid value: '%s=%s'"
-					   "\n", sg->name, cs->name, saddr, key,
-					   val);
+				TFW_ERR_NL("Invalid value: '%s'\n", val);
 				return -EINVAL;
 			}
 			has_conns_n = true;
+		} else if (!strcasecmp(key, "weight")) {
+			if (has_weight) {
+				TFW_ERR_NL("Duplicate arg: '%s'\n", key);
+				return -EINVAL;
+			}
+			if (tfw_cfg_parse_int(val, &weight)) {
+				TFW_ERR_NL("Invalid value: '%s'\n", val);
+				return -EINVAL;
+			}
+			has_weight = true;
 		} else {
-			TFW_ERR_NL("%s: %s %s: Unsupported argument: '%s=%s'\n",
-				   sg->name, cs->name, saddr, key, val);
+			TFW_ERR_NL("Unsupported argument: '%s'\n", key);
 			return -EINVAL;
 		}
 	}
@@ -755,17 +762,23 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce,
 	if (!has_conns_n) {
 		conns_n = TFW_CFG_SRV_CONNS_N_DEF;
 	} else if ((conns_n < 1) || (conns_n > TFW_SRV_MAX_CONN)) {
-		TFW_ERR_NL("%s: %s %s: Out of range of [1..%d]: 'conns_n=%d'\n",
-			   sg->name, cs->name, saddr, TFW_SRV_MAX_CONN,
-			   conns_n);
+		TFW_ERR_NL("Out of range of [1..%d]: 'conns_n=%d'\n",
+			   TFW_SRV_MAX_CONN, conns_n);
+		return -EINVAL;
+	}
+	/* Default weight is set only for static ratio scheduler. */
+	if (has_weight && ((weight < 1) || (weight > 100))) {
+		TFW_ERR_NL("Out of range of [%d..%d]: 'weight=%d'",
+			   TFW_CFG_SRV_WEIGHT_MIN, TFW_CFG_SRV_WEIGHT_MAX,
+			   weight);
 		return -EINVAL;
 	}
 
 	if (!(srv = tfw_server_create(&addr))) {
-		TFW_ERR_NL("%s: %s %s: Error handling the server\n",
-			   sg->name, cs->name, saddr);
+		TFW_ERR_NL("Error handling the server: '%s'\n", ce->vals[0]);
 		return -EINVAL;
 	}
+	srv->weight = weight;
 	tfw_sg_add(sg, srv);
 
 	*arg_srv = srv;
@@ -782,7 +795,7 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce,
  *	server 10.0.0.3 conns_n=1;
  * }
 *
- * Every server is simply added to the tfw_srv_cfg_curr_group.
+ * Every server is simply added to the tfw_cfg_in_sg.
 */
 static int
 tfw_cfgop_in_server(TfwCfgSpec *cs, TfwCfgEntry *ce)
@@ -835,7 +848,8 @@ tfw_cfgop_out_server(TfwCfgSpec *cs, TfwCfgEntry *ce)
 		static const char __read_mostly s_default[] = "default";
 
 		if (!(tfw_cfg_out_sg = tfw_sg_new(s_default, GFP_KERNEL))) {
-			TFW_ERR_NL("Unable to add default server group\n");
+			TFW_ERR_NL("Unable to add server group '%s'\n",
+				   s_default);
 			return -EINVAL;
 		}
 	}
@@ -864,31 +878,64 @@ static int
 tfw_cfgop_begin_srv_group(TfwCfgSpec *cs, TfwCfgEntry *ce)
 {
 	if (ce->val_n != 1) {
-		TFW_ERR_NL("%s %s: Invalid number of arguments: %zd\n",
-			   cs->name, ce->val_n ? ce->vals[0] : "", ce->val_n);
+		TFW_ERR_NL("Invalid number of arguments: %zd\n", ce->val_n);
 		return -EINVAL;
 	}
 	if (ce->attr_n) {
-		TFW_ERR_NL("%s %s: Arguments may not have the \'=\' sign\n",
-			   cs->name, ce->vals[0]);
+		TFW_ERR_NL("Arguments may not have the \'=\' sign\n");
 		return -EINVAL;
 	}
 
 	if (!(tfw_cfg_in_sg = tfw_sg_new(ce->vals[0], GFP_KERNEL))) {
-		TFW_ERR_NL("%s %s: Unable to add group\n", cs->name,
-			   ce->vals[0]);
+		TFW_ERR_NL("Unable to add group: '%s'\n", ce->vals[0]);
 		return -EINVAL;
 	}
 
 	TFW_DBG("begin srv_group: %s\n", tfw_cfg_in_sg->name);
 
-	tfw_cfg_in_slstsz = 0;
-	tfw_cfg_in_sched = tfw_cfg_out_sched;
 	tfw_cfg_in_queue_size = tfw_cfg_out_queue_size;
 	tfw_cfg_in_fwd_timeout = tfw_cfg_out_fwd_timeout;
 	tfw_cfg_in_fwd_retries = tfw_cfg_out_fwd_retries;
 	tfw_cfg_in_cns_retries = tfw_cfg_out_cns_retries;
 	tfw_cfg_in_retry_nip = tfw_cfg_out_retry_nip;
+	tfw_cfg_in_sg_flags = tfw_cfg_out_sg_flags;
+	tfw_cfg_in_sched = tfw_cfg_out_sched;
+
+	tfw_cfg_in_slstsz = 0;
 	return 0;
 }
 
+static int
+tfw_cfg_sg_ratio_adjust(TfwSrvGroup *sg, TfwServer **lst, unsigned int lstsz)
+{
+	int i;
+
+	if (sg->flags & TFW_SG_F_SCHED_RATIO_STATIC) {
+		for (i = 0; i < lstsz; ++i) {
+			if (!lst[i]->weight)
+				lst[i]->weight = TFW_CFG_SRV_WEIGHT_DEF;
+		}
+	}
+
+	return 0;
+}
+
+static int
+tfw_cfg_sg_ratio_verify(TfwSrvGroup *sg, TfwServer **lst, unsigned int lstsz)
+{
+	int i;
+
+	if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) {
+		for (i = 0; i < lstsz; ++i)
+			if (lst[i]->weight)
+				break;
+		if (i < lstsz) {
+			TFW_ERR_NL("srv_group: %s: static weight [%d] used "
+				   "with 'dynamic' scheduler option\n",
+				   sg->name, lst[i]->weight);
+			return -EINVAL;
+		}
+	}
 
 	return 0;
 }
@@ -920,8 +967,17 @@ tfw_cfgop_finish_srv_group(TfwCfgSpec *cs)
 		     ? msecs_to_jiffies(tfw_cfg_in_fwd_timeout * 1000)
 		     : ULONG_MAX;
 	sg->max_refwd = tfw_cfg_in_fwd_retries ? : UINT_MAX;
+	sg->flags = tfw_cfg_in_sg_flags;
 	sg->flags |= tfw_cfg_in_retry_nip ? TFW_SRV_RETRY_NIP : 0;
 
+	if (!strcasecmp(tfw_cfg_in_sched->name, "round-robin")) {
+		if (tfw_cfg_sg_ratio_verify(sg, tfw_cfg_in_slst,
+					    tfw_cfg_in_slstsz))
+			return -EINVAL;
+		if (tfw_cfg_sg_ratio_adjust(sg, tfw_cfg_in_slst,
+					    tfw_cfg_in_slstsz))
+			return -EINVAL;
+	}
 	if (tfw_sg_set_sched(sg, tfw_cfg_in_sched->name)) {
 		TFW_ERR_NL("%s %s: Unable to set scheduler: '%s'\n",
 			   cs->name, sg->name, tfw_cfg_in_sched->name);
@@ -943,31 +999,94 @@ tfw_cfgop_finish_srv_group(TfwCfgSpec *cs)
 	return 0;
 }
 
+static int
+tfw_cfg_handle_ratio(TfwCfgSpec *cs, TfwCfgEntry *ce, unsigned int *sg_flags)
+{
+	unsigned int idx, flags, value;
+
+	if (ce->val_n < 2) {
+		/* Default ratio scheduler type. */
+		flags = TFW_SG_F_SCHED_RATIO_STATIC;
+	} else if (!strcasecmp(ce->vals[1], "static")) {
+		flags = TFW_SG_F_SCHED_RATIO_STATIC;
+	} else if (!strcasecmp(ce->vals[1], "dynamic")) {
+		flags = TFW_SG_F_SCHED_RATIO_DYNAMIC;
+		if (ce->val_n < 3) {
+			/* Default dynamic type. */
+			flags |= TFW_PSTATS_IDX_AVG;
+			goto done;
+		}
+		if (!strcasecmp(ce->vals[2], "minimum")) {
+			idx = TFW_PSTATS_IDX_MIN;
+		} else if (!strcasecmp(ce->vals[2], "maximum")) {
+			idx = TFW_PSTATS_IDX_MAX;
+		} else if (!strcasecmp(ce->vals[2], "average")) {
+			idx = TFW_PSTATS_IDX_AVG;
+		} else if (!strcasecmp(ce->vals[2], "percentile")) {
+			if (ce->val_n < 4) {
+				/* Default percentile. */
+				flags |= TFW_PSTATS_IDX_P90;
+				goto done;
+			}
+			if (tfw_cfg_parse_int(ce->vals[3], &value)) {
+				TFW_ERR_NL("Invalid value: '%s'\n",
+					   ce->vals[3]);
+				return -EINVAL;
+			}
+			for (idx = 0; idx < ARRAY_SIZE(tfw_pstats_ith); ++idx) {
+				if (!tfw_pstats_ith[idx])
+					continue;
+				if (tfw_pstats_ith[idx] == value)
+					break;
+			}
+			if (idx == ARRAY_SIZE(tfw_pstats_ith)) {
+				TFW_ERR_NL("Invalid value: '%s'\n",
+					   ce->vals[3]);
+				return -EINVAL;
+			}
+		} else {
+			TFW_ERR_NL("Unsupported argument: '%s'\n", ce->vals[2]);
+			return -EINVAL;
+		}
+		flags |= idx;
+	} else {
+		TFW_ERR_NL("Unsupported argument: '%s'\n", ce->vals[1]);
+		return -EINVAL;
+	}
+
+done:
+	*sg_flags = flags;
+
+	return 0;
+}
+
 /*
  * Common code to handle 'sched' directive.
  */
 static int
-tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, TfwScheduler **arg_sched)
+tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce,
+		TfwScheduler **arg_sched, unsigned int *sg_flags)
 {
 	TfwScheduler *sched;
 
 	if (!ce->val_n) {
-		TFW_ERR_NL("%s: Invalid number of arguments: %zd\n",
-			   cs->name, ce->val_n);
-		return -EINVAL;
+		TFW_ERR_NL("Invalid number of arguments: %zd\n", ce->val_n);
+		return -EINVAL;
 	}
 	if (ce->attr_n) {
-		TFW_ERR_NL("%s %s: Arguments may not have the \'=\' sign\n",
-			   cs->name, ce->vals[0]);
+		TFW_ERR_NL("Arguments may not have the \'=\' sign\n");
 		return -EINVAL;
 	}
 
 	if (!(sched = tfw_sched_lookup(ce->vals[0]))) {
-		TFW_ERR_NL("%s %s: Unrecognized scheduler: '%s'\n",
-			   cs->name, ce->vals[0], ce->vals[0]);
+		TFW_ERR_NL("Unrecognized scheduler: '%s'\n", ce->vals[0]);
 		return -EINVAL;
 	}
+	if (!strcasecmp(sched->name, "round-robin"))
+		if (tfw_cfg_handle_ratio(cs, ce, sg_flags))
+			return -EINVAL;
+
 	*arg_sched = sched;
 
 	return 0;
@@ -976,13 +1095,15 @@ tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, TfwScheduler **arg_sched)
 static int
 tfw_cfgop_in_sched(TfwCfgSpec *cs, TfwCfgEntry *ce)
 {
-	return tfw_cfgop_sched(cs, ce, &tfw_cfg_in_sched);
+	return tfw_cfgop_sched(cs, ce, &tfw_cfg_in_sched,
+			       &tfw_cfg_in_sg_flags);
 }
 
 static int
 tfw_cfgop_out_sched(TfwCfgSpec *cs, TfwCfgEntry *ce)
 {
-	return tfw_cfgop_sched(cs, ce, &tfw_cfg_out_sched);
+	return tfw_cfgop_sched(cs, ce, &tfw_cfg_out_sched,
+			       &tfw_cfg_out_sg_flags);
 }
 
 /**
@@ -997,6 +1118,7 @@ tfw_clean_srv_groups(TfwCfgSpec *cs)
 	tfw_cfg_in_sg = tfw_cfg_out_sg = NULL;
 	tfw_cfg_in_sched = tfw_cfg_out_sched = NULL;
 	tfw_cfg_in_slstsz = tfw_cfg_out_slstsz = 0;
+	tfw_cfg_in_sg_flags = tfw_cfg_out_sg_flags = 0;
 }
 
 static int
@@ -1014,6 +1136,7 @@ tfw_sock_srv_start(void)
 		     ? msecs_to_jiffies(tfw_cfg_out_fwd_timeout * 1000)
 		     : ULONG_MAX;
 	sg->max_refwd = tfw_cfg_out_fwd_retries ? : UINT_MAX;
+	sg->flags = tfw_cfg_out_sg_flags;
 	sg->flags |= tfw_cfg_out_retry_nip ? TFW_SRV_RETRY_NIP : 0;
 
 	if (tfw_sg_set_sched(sg, tfw_cfg_out_sched->name)) {
@@ -1182,6 +1305,7 @@ TfwCfgMod tfw_sock_srv_cfg_mod = {
 int
 tfw_sock_srv_init(void)
 {
+	BUILD_BUG_ON(_TFW_PSTATS_IDX_COUNT > TFW_SG_F_PSTATS_IDX_MASK);
 	BUG_ON(tfw_srv_conn_cache);
 	tfw_srv_conn_cache = kmem_cache_create("tfw_srv_conn_cache",
 					       sizeof(TfwSrvConn), 0, 0, NULL);

From 2967e1278798c0ef13dbb60e413dd887865c811d Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Wed, 22 Mar 2017 15:01:02 +0300
Subject: [PATCH 05/37] Rework processing of server-related configuration
 entries.

The scheduler API is called only after all 'srv_group', 'server', and
'sched' directives in all groups in the configuration file are
processed. That way the number of servers in each group, and the
number of connections for each server are known to a scheduler at the
time its API is called.

The number of servers in a group and the number of connections to a
server are stored in the respective structures TfwSrvGroup{} and
TfwServer{}. They need to be kept somewhere anyway, and they serve as
upper limits for control purposes, especially in the scheduler API.
Also, that's useful for dynamic memory allocation in schedulers.

Round-robin and hash schedulers are modified to use the actual values
for the number of servers in a group, and the number of connections
for each server, instead of using predefined constants.
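
As an illustration (a condensed, hypothetical sketch of the pattern,
not the code in this patch), an ->add_grp() callback can now size the
scheduler data exactly from the stored counts rather than from the old
compile-time constants:

	static int example_add_grp(TfwSrvGroup *sg)	/* illustrative */
	{
		size_t s = 0;
		TfwServer *srv;
		/* sg->srv_n is final: all 'server' directives are parsed. */
		TfwRrSrv *srvs = kzalloc(sizeof(*srvs) * sg->srv_n, GFP_KERNEL);

		if (!srvs)
			return -ENOMEM;
		list_for_each_entry(srv, &sg->srv_list, list) {
			/* Exactly srv->conn_n connections will be added. */
			srvs[s].conns = kzalloc(sizeof(TfwSrvConn *)
						* srv->conn_n, GFP_KERNEL);
			if (!srvs[s].conns)
				goto err;
			++s;
		}
		sg->sched_data = srvs;
		return 0;
	err:
		while (s--)
			kfree(srvs[s].conns);
		kfree(srvs);
		return -ENOMEM;
	}

See tfw_sched_rr_alloc_data() in the diff below for the real version
with the TfwRrSrvList wrapper.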
--- tempesta_fw/http.c | 4 +- tempesta_fw/sched/tfw_sched_hash.c | 59 ++-- tempesta_fw/sched/tfw_sched_rr.c | 69 ++++- tempesta_fw/server.c | 10 +- tempesta_fw/server.h | 35 ++- tempesta_fw/sock_srv.c | 393 ++++++++++++++------------- tempesta_fw/t/unit/sched_helper.c | 59 ++-- tempesta_fw/t/unit/sched_helper.h | 12 +- tempesta_fw/t/unit/test_sched_hash.c | 49 ++-- tempesta_fw/t/unit/test_sched_http.c | 88 +++--- tempesta_fw/t/unit/test_sched_rr.c | 37 +-- 11 files changed, 480 insertions(+), 335 deletions(-) diff --git a/tempesta_fw/http.c b/tempesta_fw/http.c index fdbafc010..4e9e9ea1e 100644 --- a/tempesta_fw/http.c +++ b/tempesta_fw/http.c @@ -343,7 +343,7 @@ tfw_http_send_502(TfwHttpReq *req, const char *reason) .flags = 4 << TFW_STR_CN_SHIFT }; - TFW_DBG("Send HTTP 502 response: %s:\n", reason); + TFW_DBG("Send HTTP 502 response: %s\n", reason); return tfw_http_send_resp(req, &rh, __TFW_STR_CH(&rh, 1)); } @@ -368,7 +368,7 @@ tfw_http_send_504(TfwHttpReq *req, const char *reason) .flags = 4 << TFW_STR_CN_SHIFT }; - TFW_DBG("Send HTTP 504 response: %s:\n", reason); + TFW_DBG("Send HTTP 504 response: %s\n", reason); return tfw_http_send_resp(req, &rh, __TFW_STR_CH(&rh, 1)); } diff --git a/tempesta_fw/sched/tfw_sched_hash.c b/tempesta_fw/sched/tfw_sched_hash.c index c5d110f48..aba766283 100644 --- a/tempesta_fw/sched/tfw_sched_hash.c +++ b/tempesta_fw/sched/tfw_sched_hash.c @@ -51,23 +51,43 @@ MODULE_LICENSE("GPL"); typedef struct { TfwSrvConn *srv_conn; unsigned long hash; +} TfwConnHashEnt; + +typedef struct { + TfwConnHashEnt *chent; + int conn_n; } TfwConnHash; -/* The last item is used as the list teminator. */ +/* The last item is used as the list terminator. */ #define __HLIST_SZ(n) ((n) + 1) -#define __HDATA_SZ(n) (__HLIST_SZ(n) * sizeof(TfwConnHash)) +#define __HDATA_SZ(n) (__HLIST_SZ(n) * sizeof(TfwConnHashEnt)) -static void +static int tfw_sched_hash_alloc_data(TfwSrvGroup *sg) { - sg->sched_data = kzalloc(__HDATA_SZ(TFW_SG_MAX_CONN), GFP_KERNEL); - BUG_ON(!sg->sched_data); + int conn_n = 0; + TfwServer *srv; + TfwConnHash *ch; + + list_for_each_entry(srv, &sg->srv_list, list) + conn_n += srv->conn_n; + + ch = kzalloc(sizeof(TfwConnHash) + __HDATA_SZ(conn_n), GFP_KERNEL); + if (!ch) + return -ENOMEM; + + ch->chent = (TfwConnHashEnt *)(ch + 1); + ch->conn_n = conn_n; + sg->sched_data = ch; + + return 0; } static void tfw_sched_hash_free_data(TfwSrvGroup *sg) { kfree(sg->sched_data); + sg->sched_data = NULL; } static unsigned long @@ -102,14 +122,14 @@ static void tfw_sched_hash_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) { size_t i; - TfwConnHash *conn_hash = sg->sched_data; + TfwConnHash *ch = sg->sched_data; - BUG_ON(!conn_hash); - for (i = 0; i < __HLIST_SZ(TFW_SG_MAX_CONN); ++i) { - if (conn_hash[i].srv_conn) + BUG_ON(!ch); + for (i = 0; i < __HLIST_SZ(ch->conn_n); ++i) { + if (ch->chent[i].srv_conn) continue; - conn_hash[i].srv_conn = srv_conn; - conn_hash[i].hash = __calc_conn_hash(srv, i); + ch->chent[i].srv_conn = srv_conn; + ch->chent[i].hash = __calc_conn_hash(srv, i); return; } BUG(); @@ -140,19 +160,20 @@ tfw_sched_hash_get_srv_conn(TfwMsg *msg, TfwSrvGroup *sg) { unsigned long tries, msg_hash, curr_weight, best_weight = 0; TfwSrvConn *best_srv_conn = NULL; - TfwConnHash *ch; + TfwConnHash *ch = sg->sched_data; + TfwConnHashEnt *chent; msg_hash = tfw_http_req_key_calc((TfwHttpReq *)msg); - for (tries = 0; tries < __HLIST_SZ(TFW_SG_MAX_CONN); ++tries) { - for (ch = sg->sched_data; ch->srv_conn; ++ch) { - if 
(unlikely(tfw_srv_conn_restricted(ch->srv_conn) - || tfw_srv_conn_queue_full(ch->srv_conn) - || !tfw_srv_conn_live(ch->srv_conn))) + for (tries = 0; tries < __HLIST_SZ(ch->conn_n); ++tries) { + for (chent = ch->chent; chent->srv_conn; ++chent) { + if (unlikely(tfw_srv_conn_restricted(chent->srv_conn) + || tfw_srv_conn_queue_full(chent->srv_conn) + || !tfw_srv_conn_live(chent->srv_conn))) continue; - curr_weight = msg_hash ^ ch->hash; + curr_weight = msg_hash ^ chent->hash; if (curr_weight > best_weight) { best_weight = curr_weight; - best_srv_conn = ch->srv_conn; + best_srv_conn = chent->srv_conn; } } if (unlikely(!best_srv_conn)) diff --git a/tempesta_fw/sched/tfw_sched_rr.c b/tempesta_fw/sched/tfw_sched_rr.c index 639a2da84..658135d37 100644 --- a/tempesta_fw/sched/tfw_sched_rr.c +++ b/tempesta_fw/sched/tfw_sched_rr.c @@ -39,7 +39,7 @@ typedef struct { atomic64_t rr_counter; size_t conn_n; TfwServer *srv; - TfwSrvConn *conns[TFW_SRV_MAX_CONN]; + TfwSrvConn **conns; } TfwRrSrv; /** @@ -51,20 +51,69 @@ typedef struct { typedef struct { atomic64_t rr_counter; size_t srv_n; - TfwRrSrv srvs[TFW_SG_MAX_SRV]; + TfwRrSrv *srvs; } TfwRrSrvList; static void +tfw_sched_rr_cleanup(TfwSrvGroup *sg) +{ + size_t s; + TfwRrSrvList *sl = sg->sched_data; + + if (!sl) + return; + for (s = 0; s < sg->srv_n; ++s) + if (sl->srvs[s].conns) + kfree(sl->srvs[s].conns); + kfree(sl->srvs); + kfree(sl); + sg->sched_data = NULL; +} + +static int tfw_sched_rr_alloc_data(TfwSrvGroup *sg) { + int ret; + size_t size, srv_n = 0; + TfwRrSrvList *sl; + TfwServer *srv; + sg->sched_data = kzalloc(sizeof(TfwRrSrvList), GFP_KERNEL); - BUG_ON(!sg->sched_data); + if (!sg->sched_data) + return -ENOMEM; + sl = sg->sched_data; + + sl->srvs = kzalloc(sizeof(TfwRrSrv) * sg->srv_n, GFP_KERNEL); + if (!sl->srvs) { + kfree(sl); + return -ENOMEM; + } + + list_for_each_entry(srv, &sg->srv_list, list) { + if (srv_n >= sg->srv_n) { + ret = -EINVAL; + goto cleanup; + } + size = sizeof(TfwSrvConn *) * srv->conn_n; + sl->srvs[srv_n].conns = kzalloc(size, GFP_KERNEL); + if (!sl->srvs[srv_n].conns) { + ret = -ENOMEM; + goto cleanup; + } + ++srv_n; + } + + return 0; + +cleanup: + tfw_sched_rr_cleanup(sg); + return ret; } static void tfw_sched_rr_free_data(TfwSrvGroup *sg) { - kfree(sg->sched_data); + tfw_sched_rr_cleanup(sg); } /** @@ -83,22 +132,26 @@ tfw_sched_rr_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) for (s = 0; s < sl->srv_n; ++s) if (sl->srvs[s].srv == srv) break; + BUG_ON(s >= sg->srv_n); + if (s == sl->srv_n) { sl->srvs[s].srv = srv; ++sl->srv_n; - BUG_ON(sl->srv_n > TFW_SG_MAX_SRV); } srv_cl = &sl->srvs[s]; for (c = 0; c < srv_cl->conn_n; ++c) if (srv_cl->conns[c] == srv_conn) { - TFW_WARN("sched_rr: Try to add existing connection," - " srv=%zu conn=%zu\n", s, c); + TFW_WARN("sched '%s': attempt to add an existing " + "connection: srv_group '%s' server '%zd' " + "connection '%zd'\n", + sg->sched->name, sg->name, s, c); return; } + BUG_ON(c >= srv->conn_n); + srv_cl->conns[c] = srv_conn; ++srv_cl->conn_n; - BUG_ON(srv_cl->conn_n > TFW_SRV_MAX_CONN); } /** diff --git a/tempesta_fw/server.c b/tempesta_fw/server.c index 5ebae3c2a..1eb0f7ad5 100644 --- a/tempesta_fw/server.c +++ b/tempesta_fw/server.c @@ -117,15 +117,13 @@ tfw_sg_new(const char *name, gfp_t flags) TFW_DBG("new server group: '%s'\n", name); - sg = kmalloc(sizeof(*sg) + name_size, flags); + sg = kmalloc(sizeof(*sg) + name_size, flags | __GFP_ZERO); if (!sg) return NULL; INIT_LIST_HEAD(&sg->list); INIT_LIST_HEAD(&sg->srv_list); rwlock_init(&sg->lock); - 
sg->sched = NULL; - sg->sched_data = NULL; memcpy(sg->name, name, name_size); write_lock(&sg_lock); @@ -161,9 +159,8 @@ tfw_sg_count(void) TfwSrvGroup *sg; read_lock(&sg_lock); - list_for_each_entry(sg, &sg_list, list) { + list_for_each_entry(sg, &sg_list, list) ++count; - } read_unlock(&sg_lock); return count; @@ -181,6 +178,7 @@ tfw_sg_add(TfwSrvGroup *sg, TfwServer *srv) TFW_DBG2("Add new backend server\n"); write_lock(&sg->lock); list_add(&srv->list, &sg->srv_list); + ++sg->srv_n; write_unlock(&sg->lock); } @@ -201,7 +199,7 @@ tfw_sg_set_sched(TfwSrvGroup *sg, const char *sched_name) sg->sched = s; if (s->add_grp) - s->add_grp(sg); + return s->add_grp(sg); return 0; } diff --git a/tempesta_fw/server.h b/tempesta_fw/server.h index af7090846..73e9cee0e 100644 --- a/tempesta_fw/server.h +++ b/tempesta_fw/server.h @@ -25,9 +25,14 @@ #include "connection.h" #include "peer.h" -#define TFW_SRV_MAX_CONN 32 /* TfwSrvConn{} per TfwServer{} */ -#define TFW_SG_MAX_SRV 32 /* TfwServer{} per TfwSrvGroup{} */ -#define TFW_SG_MAX_CONN (TFW_SG_MAX_SRV * TFW_SRV_MAX_CONN) +/* + * Maximum values for the number of upstream servers in a group, + * and the number of connections of an upstream server. + */ +#define TFW_SRV_MAX_CONN_N USHRT_MAX +#define TFW_SG_MAX_SRV_N USHRT_MAX +#define TFW_SG_MAX_CONN_N \ + ((unsigned long)TFW_SG_MAX_SRV_N * TFW_SRV_MAX_CONN_N) typedef struct tfw_srv_group_t TfwSrvGroup; typedef struct tfw_scheduler_t TfwScheduler; @@ -39,6 +44,7 @@ typedef struct tfw_scheduler_t TfwScheduler; * @sg - back-reference to the server group; * @apm - opaque handle for APM stats; * @weight - static server weight for load balancers; + * @conn_n - configured number of connections to the server; */ typedef struct { TFW_PEER_COMMON; @@ -46,7 +52,8 @@ typedef struct { TfwSrvGroup *sg; void *apm; int stress; - unsigned char weight; + int weight; + int conn_n; } TfwServer; /** @@ -61,6 +68,7 @@ typedef struct { * @lock - synchronizes the group readers with updaters; * @sched - requests scheduling handler; * @sched_data - private scheduler data for the server group; + * @srv_n - configured number of servers in the group; * @max_qsize - maximum queue size of a server connection; * @max_refwd - maximum number of tries for forwarding a request; * @max_jqage - maximum age of a request in a server connection, in jiffies; @@ -74,6 +82,7 @@ struct tfw_srv_group_t { rwlock_t lock; TfwScheduler *sched; void *sched_data; + int srv_n; unsigned int max_qsize; unsigned int max_refwd; unsigned long max_jqage; @@ -97,14 +106,16 @@ struct tfw_srv_group_t { * * @name - name of the algorithm; * @list - member in the list of registered schedulers; - * @add_grp - add server group to the scheduler; + * @add_grp - add server group to the scheduler. + Called in process context at configuration time. + * Called only after the group is set up with all servers; * @del_grp - delete server group from the scheduler; - * @add_conn - add connection and server if it's new, called in process - * context at configuration time; - * @sched_grp - server group scheduling virtual method, typically returns - * result of underlying @sched_srv(); - * @sched_srv - requests scheduling virtual method, can be called in heavy - * concurrent environment; + * @add_conn - add connection and server if it's new. + Called in process context at configuration time; + * @sched_grp - server group scheduling virtual method. + Typically returns the result of underlying @sched_srv(); + * @sched_srv - requests scheduling virtual method. 
+ May be called in heavily concurrent environment; * * All schedulers must be able to scheduler messages among servers of one * server group, i.e. @sched_srv must be defined. @@ -118,7 +129,7 @@ struct tfw_srv_group_t { struct tfw_scheduler_t { const char *name; struct list_head list; - void (*add_grp)(TfwSrvGroup *sg); + int (*add_grp)(TfwSrvGroup *sg); void (*del_grp)(TfwSrvGroup *sg); void (*add_conn)(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn); diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c index bc884a348..865be77d3 100644 --- a/tempesta_fw/sock_srv.c +++ b/tempesta_fw/sock_srv.c @@ -438,6 +438,13 @@ tfw_sock_srv_disconnect(TfwConn *conn) * not-yet-established connections in the TfwServer->conn_list. */ +static inline int +__tfw_sock_srv_connect_try_later_cb(TfwSrvConn *srv_conn) +{ + tfw_sock_srv_connect_try_later(srv_conn); + return 0; +} + static int tfw_sock_srv_connect_srv(TfwServer *srv) { @@ -451,10 +458,8 @@ tfw_sock_srv_connect_srv(TfwServer *srv) * is locked, and spews lots of warnings. LOCKDEP doesn't know * that parallel execution can't happen with the same socket. */ - list_for_each_entry(srv_conn, &srv->conn_list, list) - tfw_sock_srv_connect_try_later(srv_conn); - - return 0; + return tfw_peer_for_each_conn(srv, srv_conn, list, + __tfw_sock_srv_connect_try_later_cb); } /** @@ -465,7 +470,8 @@ tfw_sock_srv_disconnect_srv(TfwServer *srv) { TfwConn *conn; - return tfw_peer_for_each_conn(srv, conn, list, tfw_sock_srv_disconnect); + return tfw_peer_for_each_conn(srv, conn, list, + tfw_sock_srv_disconnect); } /* @@ -520,18 +526,34 @@ tfw_srv_conn_free(TfwSrvConn *srv_conn) kmem_cache_free(tfw_srv_conn_cache, srv_conn); } +static inline int +__tfw_sock_srv_sg_add_conn_cb(TfwSrvConn *srv_conn) +{ + TfwServer *srv = (TfwServer *)srv_conn->peer; + tfw_sg_add_conn(srv->sg, srv, srv_conn); + + return 0; +} + static int -tfw_sock_srv_add_conns(TfwServer *srv, int conns_n) +tfw_sock_srv_sg_add_conns(TfwServer *srv) +{ + TfwSrvConn *srv_conn; + + return tfw_peer_for_each_conn(srv, srv_conn, list, + __tfw_sock_srv_sg_add_conn_cb); +} + +static int +tfw_sock_srv_add_conns(TfwServer *srv) { int i; TfwSrvConn *srv_conn; - for (i = 0; i < conns_n; ++i) { + for (i = 0; i < srv->conn_n; ++i) { if (!(srv_conn = tfw_srv_conn_alloc())) return -ENOMEM; - tfw_connection_link_peer((TfwConn *)srv_conn, - (TfwPeer *)srv); - tfw_sg_add_conn(srv->sg, srv, srv_conn); + tfw_connection_link_peer((TfwConn *)srv_conn, (TfwPeer *)srv); } return 0; @@ -546,6 +568,7 @@ tfw_sock_srv_del_conns(TfwServer *srv) tfw_connection_unlink_from_peer((TfwConn *)srv_conn); tfw_srv_conn_free(srv_conn); } + return 0; } @@ -573,20 +596,20 @@ tfw_sock_srv_delete_all_conns(void) #define TFW_CFG_SRV_WEIGHT_MIN 1 #define TFW_CFG_SRV_WEIGHT_MAX 100 #define TFW_CFG_SRV_WEIGHT_DEF 50 +#define TFW_CFG_SG_NAME_DEF "default" -static TfwServer *tfw_cfg_in_slst[TFW_SG_MAX_SRV]; -static TfwServer *tfw_cfg_out_slst[TFW_SG_MAX_SRV]; -static int tfw_cfg_in_nconn[TFW_SG_MAX_SRV]; -static int tfw_cfg_out_nconn[TFW_SG_MAX_SRV]; -static int tfw_cfg_in_slstsz, tfw_cfg_out_slstsz; -static TfwScheduler *tfw_cfg_in_sched, *tfw_cfg_out_sched; -static TfwSrvGroup *tfw_cfg_in_sg, *tfw_cfg_out_sg; +static struct list_head tfw_cfg_in_slst = LIST_HEAD_INIT(tfw_cfg_in_slst); +static struct list_head tfw_cfg_out_slst = LIST_HEAD_INIT(tfw_cfg_out_slst); +static struct list_head *tfw_cfg_slst; +static int tfw_cfg_slstsz, tfw_cfg_out_slstsz; +static TfwScheduler *tfw_cfg_sched, *tfw_cfg_out_sched; +static TfwSrvGroup *tfw_cfg_sg, 
*tfw_cfg_out_sg; -static int tfw_cfg_in_queue_size = TFW_CFG_SRV_QUEUE_SIZE_DEF; -static int tfw_cfg_in_fwd_timeout = TFW_CFG_SRV_FWD_TIMEOUT_DEF; -static int tfw_cfg_in_fwd_retries = TFW_CFG_SRV_FWD_RETRIES_DEF; -static int tfw_cfg_in_cns_retries = TFW_CFG_SRV_CNS_RETRIES_DEF; -static int tfw_cfg_in_retry_nip = TFW_CFG_SRV_RETRY_NIP_DEF; +static int tfw_cfg_queue_size = TFW_CFG_SRV_QUEUE_SIZE_DEF; +static int tfw_cfg_fwd_timeout = TFW_CFG_SRV_FWD_TIMEOUT_DEF; +static int tfw_cfg_fwd_retries = TFW_CFG_SRV_FWD_RETRIES_DEF; +static int tfw_cfg_cns_retries = TFW_CFG_SRV_CNS_RETRIES_DEF; +static int tfw_cfg_retry_nip = TFW_CFG_SRV_RETRY_NIP_DEF; static int tfw_cfg_out_queue_size = TFW_CFG_SRV_QUEUE_SIZE_DEF; static int tfw_cfg_out_fwd_timeout = TFW_CFG_SRV_FWD_TIMEOUT_DEF; @@ -594,7 +617,7 @@ static int tfw_cfg_out_fwd_retries = TFW_CFG_SRV_FWD_RETRIES_DEF; static int tfw_cfg_out_cns_retries = TFW_CFG_SRV_CNS_RETRIES_DEF; static int tfw_cfg_out_retry_nip = TFW_CFG_SRV_RETRY_NIP_DEF; -static unsigned int tfw_cfg_in_sg_flags = TFW_SG_F_SCHED_RATIO_STATIC; +static unsigned int tfw_cfg_sg_flags = TFW_SG_F_SCHED_RATIO_STATIC; static unsigned int tfw_cfg_out_sg_flags = TFW_SG_F_SCHED_RATIO_STATIC; static int @@ -602,14 +625,12 @@ tfw_cfgop_intval(TfwCfgSpec *cs, TfwCfgEntry *ce, int *intval) { int ret; - if (ce->attr_n) { - TFW_ERR_NL("%s: Arguments may not have the \'=\' sign\n", - cs->name); - return -EINVAL; - } if (ce->val_n != 1) { - TFW_ERR_NL("%s: Invalid number of arguments: %d\n", - cs->name, (int)ce->val_n); + TFW_ERR_NL("Invalid number of arguments: %zd\n", ce->val_n); + return -EINVAL; + } + if (ce->attr_n) { + TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); return -EINVAL; } if ((ret = tfw_cfg_parse_int(ce->vals[0], intval))) @@ -621,7 +642,7 @@ tfw_cfgop_intval(TfwCfgSpec *cs, TfwCfgEntry *ce, int *intval) static int tfw_cfgop_in_queue_size(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_intval(cs, ce, &tfw_cfg_in_queue_size); + return tfw_cfgop_intval(cs, ce, &tfw_cfg_queue_size); } static int @@ -633,7 +654,7 @@ tfw_cfgop_out_queue_size(TfwCfgSpec *cs, TfwCfgEntry *ce) static int tfw_cfgop_in_fwd_timeout(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_intval(cs, ce, &tfw_cfg_in_fwd_timeout); + return tfw_cfgop_intval(cs, ce, &tfw_cfg_fwd_timeout); } static int @@ -645,7 +666,7 @@ tfw_cfgop_out_fwd_timeout(TfwCfgSpec *cs, TfwCfgEntry *ce) static int tfw_cfgop_in_fwd_retries(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_intval(cs, ce, &tfw_cfg_in_fwd_retries); + return tfw_cfgop_intval(cs, ce, &tfw_cfg_fwd_retries); } static int @@ -658,8 +679,7 @@ static inline int tfw_cfgop_retry_nip(TfwCfgSpec *cs, TfwCfgEntry *ce, int *retry_nip) { if (ce->attr_n || ce->val_n) { - TFW_ERR_NL("%s: The option may not have arguments.\n", - cs->name); + TFW_ERR_NL("The option may not have arguments.\n"); return -EINVAL; } *retry_nip = 1; @@ -669,7 +689,7 @@ tfw_cfgop_retry_nip(TfwCfgSpec *cs, TfwCfgEntry *ce, int *retry_nip) static int tfw_cfgop_in_retry_nip(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_retry_nip(cs, ce, &tfw_cfg_in_retry_nip); + return tfw_cfgop_retry_nip(cs, ce, &tfw_cfg_retry_nip); } static int @@ -681,7 +701,7 @@ tfw_cfgop_out_retry_nip(TfwCfgSpec *cs, TfwCfgEntry *ce) static int tfw_cfgop_in_conn_retries(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_intval(cs, ce, &tfw_cfg_in_cns_retries); + return tfw_cfgop_intval(cs, ce, &tfw_cfg_cns_retries); } static int @@ -708,8 +728,7 @@ tfw_cfgop_set_conn_retries(TfwSrvGroup *sg, int recns) * 
Common code to handle 'server' directive. */ static int -tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, - TfwSrvGroup *sg, TfwServer **arg_srv, int *arg_conns_n) +tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, struct list_head *slst) { TfwAddr addr; TfwServer *srv; @@ -735,7 +754,7 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, TFW_CFG_ENTRY_FOR_EACH_ATTR(ce, i, key, val) { if (!strcasecmp(key, "conns_n")) { if (has_conns_n) { - TFW_ERR_NL("Duplicate arg: '%s'\n", key); + TFW_ERR_NL("Duplicate argument: '%s'\n", key); return -EINVAL; } if (tfw_cfg_parse_int(val, &conns_n)) { @@ -745,7 +764,7 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, has_conns_n = true; } else if (!strcasecmp(key, "weight")) { if (has_weight) { - TFW_ERR_NL("Duplicate arg: '%s'\n", key); + TFW_ERR_NL("Duplicate argument: '%s'\n", key); return -EINVAL; } if (tfw_cfg_parse_int(val, &weight)) { @@ -761,14 +780,14 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, if (!has_conns_n) { conns_n = TFW_CFG_SRV_CONNS_N_DEF; - } else if ((conns_n < 1) || (conns_n > TFW_SRV_MAX_CONN)) { + } else if ((conns_n < 1) || (conns_n > TFW_SRV_MAX_CONN_N)) { TFW_ERR_NL("Out of range of [1..%d]: 'conns_n=%d'\n", - TFW_SRV_MAX_CONN, conns_n); + TFW_SRV_MAX_CONN_N, conns_n); return -EINVAL; } /* Default weight is set only for static ratio scheduler. */ if (has_weight && ((weight < 1) || (weight > 100))) { - TFW_ERR_NL("Out of range of [%d..%d]: 'weight=%d'", + TFW_ERR_NL("Out of range of [%d..%d]: 'weight=%d'\n", TFW_CFG_SRV_WEIGHT_MIN, TFW_CFG_SRV_WEIGHT_MAX, weight); return -EINVAL; @@ -779,10 +798,8 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, return -EINVAL; } srv->weight = weight; - tfw_sg_add(sg, srv); - - *arg_srv = srv; - *arg_conns_n = conns_n; + srv->conn_n = conns_n; + list_add_tail(&srv->list, slst); return 0; } @@ -794,21 +811,13 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, * server 10.0.0.2; * server 10.0.0.3 conns_n=1; * } - * - * Every server is simply added to the tfw_cfg_in_sg. */ static int tfw_cfgop_in_server(TfwCfgSpec *cs, TfwCfgEntry *ce) { - int nconn; - TfwServer *srv; - - if (tfw_cfg_in_slstsz >= TFW_SG_MAX_SRV) + if (tfw_cfgop_server(cs, ce, tfw_cfg_slst)) return -EINVAL; - if (tfw_cfgop_server(cs, ce, tfw_cfg_in_sg, &srv, &nconn)) - return -EINVAL; - tfw_cfg_in_nconn[tfw_cfg_in_slstsz] = nconn; - tfw_cfg_in_slst[tfw_cfg_in_slstsz++] = srv; + tfw_cfg_slstsz++; return 0; } @@ -835,29 +844,9 @@ tfw_cfgop_in_server(TfwCfgSpec *cs, TfwCfgEntry *ce) static int tfw_cfgop_out_server(TfwCfgSpec *cs, TfwCfgEntry *ce) { - int nconn; - TfwServer *srv; - - if (tfw_cfg_out_slstsz >= TFW_SG_MAX_SRV) + if (tfw_cfgop_server(cs, ce, &tfw_cfg_out_slst)) return -EINVAL; - /* - * The group "default" is created implicitly, and only when - * a server outside of any group is found in the configuration. 
- */ - if (!tfw_cfg_out_sg) { - static const char __read_mostly s_default[] = "default"; - - if (!(tfw_cfg_out_sg = tfw_sg_new(s_default, GFP_KERNEL))) { - TFW_ERR_NL("Unable to add server group '%s'\n", - s_default); - return -EINVAL; - } - } - - if (tfw_cfgop_server(cs, ce, tfw_cfg_out_sg, &srv, &nconn)) - return -EINVAL; - tfw_cfg_out_nconn[tfw_cfg_out_slstsz] = nconn; - tfw_cfg_out_slst[tfw_cfg_out_slstsz++] = srv; + tfw_cfg_out_slstsz++; return 0; } @@ -879,60 +868,65 @@ tfw_cfgop_begin_srv_group(TfwCfgSpec *cs, TfwCfgEntry *ce) { if (ce->val_n != 1) { TFW_ERR_NL("Invalid number of arguments: %zd\n", ce->val_n); - return -EINVAL; + return -EINVAL; } if (ce->attr_n) { TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); return -EINVAL; } - if (!(tfw_cfg_in_sg = tfw_sg_new(ce->vals[0], GFP_KERNEL))) { + if (!(tfw_cfg_sg = tfw_sg_new(ce->vals[0], GFP_KERNEL))) { TFW_ERR_NL("Unable to add group: '%s'\n", ce->vals[0]); return -EINVAL; } - TFW_DBG("begin srv_group: %s\n", tfw_cfg_in_sg->name); + TFW_DBG("begin srv_group: %s\n", tfw_cfg_sg->name); - tfw_cfg_in_queue_size = tfw_cfg_out_queue_size; - tfw_cfg_in_fwd_timeout = tfw_cfg_out_fwd_timeout; - tfw_cfg_in_fwd_retries = tfw_cfg_out_fwd_retries; - tfw_cfg_in_cns_retries = tfw_cfg_out_cns_retries; - tfw_cfg_in_retry_nip = tfw_cfg_out_retry_nip; - tfw_cfg_in_sg_flags = tfw_cfg_out_sg_flags; - tfw_cfg_in_sched = tfw_cfg_out_sched; + tfw_cfg_queue_size = tfw_cfg_out_queue_size; + tfw_cfg_fwd_timeout = tfw_cfg_out_fwd_timeout; + tfw_cfg_fwd_retries = tfw_cfg_out_fwd_retries; + tfw_cfg_cns_retries = tfw_cfg_out_cns_retries; + tfw_cfg_retry_nip = tfw_cfg_out_retry_nip; + tfw_cfg_sg_flags = tfw_cfg_out_sg_flags; + tfw_cfg_sched = tfw_cfg_out_sched; + + BUG_ON(!list_empty(&tfw_cfg_in_slst)); + tfw_cfg_slst = &tfw_cfg_in_slst; + tfw_cfg_slstsz = 0; - tfw_cfg_in_slstsz = 0; return 0; } static int -tfw_cfg_sg_ratio_adjust(TfwSrvGroup *sg, TfwServer **lst, unsigned int lstsz) +tfw_cfg_sg_ratio_adjust(TfwSrvGroup *sg) { - int i; + TfwServer *srv; if (sg->flags & TFW_SG_F_SCHED_RATIO_STATIC) { - for (i = 0; i < lstsz; ++i) { - if (!lst[i]->weight) - lst[i]->weight = TFW_CFG_SRV_WEIGHT_DEF; - } + list_for_each_entry(srv, tfw_cfg_slst, list) + if (!srv->weight) + srv->weight = TFW_CFG_SRV_WEIGHT_DEF; } return 0; } static int -tfw_cfg_sg_ratio_verify(TfwSrvGroup *sg, TfwServer **lst, unsigned int lstsz) +tfw_cfg_sg_ratio_verify(TfwSrvGroup *sg) { - int i; + TfwServer *srv; + int count = 0; if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) { - for (i = 0; i < lstsz; ++i) - if (lst[i]->weight) + list_for_each_entry(srv, tfw_cfg_slst, list) { + if (srv->weight) break; - if (i < lstsz) { - TFW_ERR_NL("srv_group: %s: static weight [%d] used " - "with 'dynamic' scheduler option\n", - sg->name, lst[i]->weight); + ++count; + } + if (count < tfw_cfg_slstsz) { + TFW_ERR_NL("srv_group %s: static weight [%d] used " + "with 'dynamic' scheduler option\n", + sg->name, srv->weight); return -EINVAL; } } @@ -940,6 +934,53 @@ tfw_cfg_sg_ratio_verify(TfwSrvGroup *sg, TfwServer **lst, unsigned int lstsz) return 0; } +static int +tfw_cfgop_setup_srv_group(void) +{ + int ret; + TfwServer *srv, *tmp; + + BUG_ON(!tfw_cfg_sg); + BUG_ON(!tfw_cfg_sched); + + tfw_cfgop_set_conn_retries(tfw_cfg_sg, tfw_cfg_cns_retries); + tfw_cfg_sg->max_qsize = tfw_cfg_queue_size ? : UINT_MAX; + tfw_cfg_sg->max_jqage = tfw_cfg_fwd_timeout + ? msecs_to_jiffies(tfw_cfg_fwd_timeout * 1000) + : ULONG_MAX; + tfw_cfg_sg->max_refwd = tfw_cfg_fwd_retries ? 
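	/*
	 * Note on the "?:" form used for the limits above and below:
	 * the GNU shorthand "a ? : b" evaluates to a when a is
	 * non-zero, else to b, so a zero (unset) configuration value
	 * falls through to the UINT_MAX / ULONG_MAX "no limit"
	 * sentinels.
	 */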
: UINT_MAX; + + tfw_cfg_sg->flags = tfw_cfg_sg_flags; + tfw_cfg_sg->flags |= tfw_cfg_retry_nip ? TFW_SRV_RETRY_NIP : 0; + + /* Check 'ratio' scheduler configuration for incompatibilities. */ + if (!strcasecmp(tfw_cfg_sched->name, "ratio")) { + if (tfw_cfg_sg_ratio_verify(tfw_cfg_sg)) + return -EINVAL; + if (tfw_cfg_sg_ratio_adjust(tfw_cfg_sg)) + return -EINVAL; + } + /* Set up the server group with all servers that are in it. */ + list_for_each_entry_safe(srv, tmp, tfw_cfg_slst, list) { + if ((ret = tfw_sock_srv_add_conns(srv)) != 0) + return ret; + list_del(&srv->list); + tfw_sg_add(tfw_cfg_sg, srv); + } + /* + * Set up a scheduler and add the server group to the scheduler. + * Must be called only after the server group is set up with all + * servers (and all connections) that are in it. + */ + if (tfw_sg_set_sched(tfw_cfg_sg, tfw_cfg_sched->name)) { + TFW_ERR_NL("Unable to add srv_group '%s' to scheduler '%s'\n", + tfw_cfg_sg->name, tfw_cfg_sched->name); + return -EINVAL; + } + + return 0; +} + /** * The callback is invoked upon exit from a "srv_group" when all nested * directives are parsed, e.g.: @@ -953,50 +994,10 @@ tfw_cfg_sg_ratio_verify(TfwSrvGroup *sg, TfwServer **lst, unsigned int lstsz) static int tfw_cfgop_finish_srv_group(TfwCfgSpec *cs) { - int i; - TfwSrvGroup *sg = tfw_cfg_in_sg; - - BUG_ON(!sg); - BUG_ON(list_empty(&sg->srv_list)); - BUG_ON(!tfw_cfg_in_sched); - TFW_DBG("finish srv_group: %s\n", sg->name); - - tfw_cfgop_set_conn_retries(sg, tfw_cfg_in_cns_retries); - sg->max_qsize = tfw_cfg_in_queue_size ? : UINT_MAX; - sg->max_jqage = tfw_cfg_in_fwd_timeout - ? msecs_to_jiffies(tfw_cfg_in_fwd_timeout * 1000) - : ULONG_MAX; - sg->max_refwd = tfw_cfg_in_fwd_retries ? : UINT_MAX; - sg->flags = tfw_cfg_in_sg_flags; - sg->flags |= tfw_cfg_in_retry_nip ? TFW_SRV_RETRY_NIP : 0; - - if (!strcasecmp(tfw_cfg_in_sched->name, "round-robin")) { - if (tfw_cfg_sg_ratio_verify(sg, tfw_cfg_in_slst, - tfw_cfg_in_slstsz)) - return -EINVAL; - if (tfw_cfg_sg_ratio_adjust(sg, tfw_cfg_in_slst, - tfw_cfg_in_slstsz)) - return -EINVAL; - } - if (tfw_sg_set_sched(sg, tfw_cfg_in_sched->name)) { - TFW_ERR_NL("%s %s: Unable to set scheduler: '%s'\n", - cs->name, sg->name, tfw_cfg_in_sched->name); - return -EINVAL; - } - /* Add connections only after a scheduler is set. 
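
/*
 * The per-group setup removed here is consolidated in
 * tfw_cfgop_setup_srv_group() above, so both entry points share a
 * single path (a sketch of the resulting call graph):
 *
 *   "srv_group" exit     -> tfw_cfgop_finish_srv_group()
 *                             -> tfw_cfgop_setup_srv_group()
 *   tfw_sock_srv_start() [implicit "default" group, see below]
 *                             -> tfw_cfgop_setup_srv_group()
 */
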
*/ - for (i = 0; i < tfw_cfg_in_slstsz; ++i) { - TfwServer *srv = tfw_cfg_in_slst[i]; - if (tfw_sock_srv_add_conns(srv, tfw_cfg_in_nconn[i])) { - char as[TFW_ADDR_STR_BUF_SIZE] = { 0 }; - tfw_addr_ntop(&srv->addr, as, sizeof(as)); - TFW_ERR_NL("%s %s: server '%s': " - "Error adding connections\n", - cs->name, sg->name, as); - return -EINVAL; - } - } + BUG_ON(list_empty(&tfw_cfg_sg->srv_list)); + TFW_DBG("finish srv_group: %s\n", tfw_cfg_sg->name); - return 0; + return tfw_cfgop_setup_srv_group(); } static int @@ -1071,7 +1072,7 @@ tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, if (!ce->val_n) { TFW_ERR_NL("Invalid number of arguments: %zd\n", ce->val_n); - return -EINVAL; + return -EINVAL; } if (ce->attr_n) { TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); @@ -1095,8 +1096,8 @@ tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, static int tfw_cfgop_in_sched(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_sched(cs, ce, &tfw_cfg_in_sched, - &tfw_cfg_in_sg_flags); + return tfw_cfgop_sched(cs, ce, &tfw_cfg_sched, + &tfw_cfg_sg_flags); } static int @@ -1112,50 +1113,62 @@ tfw_cfgop_out_sched(TfwCfgSpec *cs, TfwCfgEntry *ce) static void tfw_clean_srv_groups(TfwCfgSpec *cs) { + TfwServer *srv, *tmp; + + list_for_each_entry_safe(srv, tmp, &tfw_cfg_in_slst, list) { + list_del(&srv->list); + tfw_sock_srv_del_conns(srv); + tfw_server_destroy(srv); + } + list_for_each_entry_safe(srv, tmp, &tfw_cfg_out_slst, list) { + list_del(&srv->list); + tfw_sock_srv_del_conns(srv); + tfw_server_destroy(srv); + } + + tfw_cfg_sg = tfw_cfg_out_sg = NULL; + tfw_cfg_sched = tfw_cfg_out_sched = NULL; + tfw_cfg_slstsz = tfw_cfg_out_slstsz = 0; + tfw_cfg_sg_flags = tfw_cfg_out_sg_flags = 0; + tfw_sock_srv_delete_all_conns(); tfw_sg_release_all(); - - tfw_cfg_in_sg = tfw_cfg_out_sg = NULL; - tfw_cfg_in_sched = tfw_cfg_out_sched = NULL; - tfw_cfg_in_slstsz = tfw_cfg_out_slstsz = 0; - tfw_cfg_in_sg_flags = tfw_cfg_out_sg_flags = 0; } static int tfw_sock_srv_start(void) { - int i, ret; - TfwSrvGroup *sg = tfw_cfg_out_sg; - - if (sg) { - BUG_ON(!tfw_cfg_out_sched); - - tfw_cfgop_set_conn_retries(sg, tfw_cfg_out_cns_retries); - sg->max_qsize = tfw_cfg_out_queue_size ? : UINT_MAX; - sg->max_jqage = tfw_cfg_out_fwd_timeout - ? msecs_to_jiffies(tfw_cfg_out_fwd_timeout * 1000) - : ULONG_MAX; - sg->max_refwd = tfw_cfg_out_fwd_retries ? : UINT_MAX; - sg->flags = tfw_cfg_out_sg_flags; - sg->flags |= tfw_cfg_out_retry_nip ? TFW_SRV_RETRY_NIP : 0; + int ret; - if (tfw_sg_set_sched(sg, tfw_cfg_out_sched->name)) { - TFW_ERR_NL("srv_group %s: Unable to set scheduler: " - "'%s'\n", sg->name, tfw_cfg_out_sched->name); + /* + * The group "default" is created implicitly, and only when + * a server outside of any group is found in the configuration. + */ + if (tfw_cfg_out_slstsz) { + tfw_cfg_out_sg = tfw_sg_new(TFW_CFG_SG_NAME_DEF, GFP_KERNEL); + if (!tfw_cfg_out_sg) { + TFW_ERR_NL("Unable to add default server group\n"); return -EINVAL; } - /* Add connections only after a scheduler is set. 
*/ - for (i = 0; i < tfw_cfg_out_slstsz; ++i) { - TfwServer *srv = tfw_cfg_out_slst[i]; - if (tfw_sock_srv_add_conns(srv, tfw_cfg_out_nconn[i])) { - char as[TFW_ADDR_STR_BUF_SIZE] = { 0 }; - tfw_addr_ntop(&srv->addr, as, sizeof(as)); - TFW_ERR_NL("srv_group %s: server '%s': " - "Error adding connections\n", - sg->name, as); - return -EINVAL; - } - } + + tfw_cfg_cns_retries = tfw_cfg_out_cns_retries; + tfw_cfg_queue_size = tfw_cfg_out_queue_size; + tfw_cfg_fwd_timeout = tfw_cfg_out_fwd_timeout; + tfw_cfg_fwd_retries = tfw_cfg_out_fwd_retries; + tfw_cfg_retry_nip = tfw_cfg_out_retry_nip; + tfw_cfg_sg_flags = tfw_cfg_out_sg_flags; + tfw_cfg_slst = &tfw_cfg_out_slst; + tfw_cfg_slstsz = tfw_cfg_out_slstsz; + tfw_cfg_sched = tfw_cfg_out_sched; + tfw_cfg_sg = tfw_cfg_out_sg; + + if ((ret = tfw_cfgop_setup_srv_group())) + return ret; + } + /* Add connections to scheduler for all servers in all groups. */ + if ((ret = tfw_sg_for_each_srv(tfw_sock_srv_sg_add_conns)) != 0) { + TFW_ERR_NL("Error adding server connections\n"); + return ret; } /* * This must be executed only after the complete configuration @@ -1249,35 +1262,35 @@ TfwCfgMod tfw_sock_srv_cfg_mod = { "server_queue_size", NULL, tfw_cfgop_out_queue_size, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { "server_forward_timeout", NULL, tfw_cfgop_out_fwd_timeout, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { "server_forward_retries", NULL, tfw_cfgop_out_fwd_retries, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { "server_retry_non_idempotent", NULL, tfw_cfgop_out_retry_nip, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { "server_connect_retries", NULL, tfw_cfgop_out_conn_retries, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { diff --git a/tempesta_fw/t/unit/sched_helper.c b/tempesta_fw/t/unit/sched_helper.c index d6efc6075..631d4273c 100644 --- a/tempesta_fw/t/unit/sched_helper.c +++ b/tempesta_fw/t/unit/sched_helper.c @@ -49,7 +49,7 @@ test_spec_cleanup(TfwCfgSpec specs[]) } TfwSrvGroup * -test_create_sg(const char *name, const char *sched_name) +test_create_sg(const char *name) { TfwSrvGroup *sg; @@ -58,16 +58,31 @@ test_create_sg(const char *name, const char *sched_name) sg = tfw_sg_new(name, GFP_ATOMIC); BUG_ON(!sg); + sg->max_qsize = 100; + + kernel_fpu_begin(); + + return sg; +} + +void +test_start_sg(TfwSrvGroup *sg, const char *sched_name) +{ + TfwServer *srv; + TfwSrvConn *srv_conn; + + kernel_fpu_end(); + { int r = tfw_sg_set_sched(sg, sched_name); BUG_ON(r); } - sg->max_qsize = 100; + list_for_each_entry(srv, &sg->srv_list, list) + list_for_each_entry(srv_conn, &srv->conn_list, list) + sg->sched->add_conn(sg, srv, srv_conn); kernel_fpu_begin(); - - return sg; } void @@ -100,28 +115,30 @@ test_create_srv(const char *in_addr, TfwSrvGroup *sg) } TfwSrvConn * -test_create_conn(TfwPeer *peer) +test_create_srv_conn(TfwServer *srv) { static struct sock __test_sock = { .sk_state = TCP_ESTABLISHED, }; - TfwConn *conn; + TfwSrvConn *srv_conn; kernel_fpu_end(); if (!tfw_srv_conn_cache) tfw_sock_srv_init(); - conn = (TfwConn *)tfw_srv_conn_alloc(); - BUG_ON(!conn); + srv_conn = tfw_srv_conn_alloc(); + BUG_ON(!srv_conn); - tfw_connection_link_peer(conn, peer); - conn->sk = &__test_sock; + tfw_connection_link_peer((TfwConn *)srv_conn, (TfwPeer *)srv); + 
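	/*
	 * Note: srv->conn_n is incremented below for every connection
	 * this helper creates, so the scheduler's add_grp() validation,
	 * which checks the length of conn_list against conn_n when the
	 * group is started, keeps passing.
	 */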
srv_conn->sk = &__test_sock; /* A connection is skipped by schedulers if (refcnt <= 0). */ - tfw_connection_revive(conn); + tfw_connection_revive((TfwConn *)srv_conn); + + srv->conn_n++; kernel_fpu_begin(); - return (TfwSrvConn *)conn; + return srv_conn; } void @@ -153,7 +170,9 @@ test_sched_generic_empty_sg(struct TestSchedHelper *sched_helper) BUG_ON(!sched_helper->get_sched_arg); BUG_ON(!sched_helper->free_sched_arg); - sg = test_create_sg("test", sched_helper->sched); + sg = test_create_sg("test"); + sg->flags = sched_helper->flags; + test_start_sg(sg, sched_helper->sched); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -178,9 +197,10 @@ test_sched_generic_one_srv_zero_conn(struct TestSchedHelper *sched_helper) BUG_ON(!sched_helper->get_sched_arg); BUG_ON(!sched_helper->free_sched_arg); - sg = test_create_sg("test", sched_helper->sched); - + sg = test_create_sg("test"); test_create_srv("127.0.0.1", sg); + sg->flags = sched_helper->flags; + test_start_sg(sg, sched_helper->sched); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -205,13 +225,16 @@ test_sched_generic_max_srv_zero_conn(struct TestSchedHelper *sched_helper) BUG_ON(!sched_helper->get_sched_arg); BUG_ON(!sched_helper->free_sched_arg); - sg = test_create_sg("test", sched_helper->sched); + sg = test_create_sg("test"); - for (j = 0; j < TFW_SG_MAX_SRV; ++j) + for (j = 0; j < TFW_TEST_SG_MAX_SRV_N; ++j) test_create_srv("127.0.0.1", sg); + sg->flags = sched_helper->flags; + test_start_sg(sg, sched_helper->sched); + for (i = 0; i < sched_helper->conn_types; ++i) { - for (j = 0; j < TFW_SG_MAX_SRV; ++j) { + for (j = 0; j < TFW_TEST_SG_MAX_SRV_N; ++j) { TfwMsg *msg = sched_helper->get_sched_arg(i); TfwSrvConn *srv_conn = sg->sched->sched_srv(msg, sg); diff --git a/tempesta_fw/t/unit/sched_helper.h b/tempesta_fw/t/unit/sched_helper.h index fa49c5dea..fdf816fbc 100644 --- a/tempesta_fw/t/unit/sched_helper.h +++ b/tempesta_fw/t/unit/sched_helper.h @@ -26,23 +26,29 @@ #include "cfg.h" #include "connection.h" +#define TFW_TEST_SG_MAX_SRV_N 64 +#define TFW_TEST_SRV_MAX_CONN_N 64 +#define TFW_TEST_SG_MAX_CONN_N \ + (TFW_TEST_SG_MAX_SRV_N * TFW_TEST_SRV_MAX_CONN_N) + int tfw_server_init(void); int tfw_sched_rr_init(void); void sched_helper_init(void); void test_spec_cleanup(TfwCfgSpec specs[]); -TfwSrvGroup *test_create_sg(const char *name, const char *sched_name); +TfwSrvGroup *test_create_sg(const char *name); +void test_start_sg(TfwSrvGroup *sg, const char *sched_name); void test_sg_release_all(void); TfwServer *test_create_srv(const char *in_addr, TfwSrvGroup *sg); - -TfwSrvConn *test_create_conn(TfwPeer *peer); +TfwSrvConn *test_create_srv_conn(TfwServer *srv); void test_conn_release_all(TfwSrvGroup *sg); struct TestSchedHelper { const char *sched; size_t conn_types; + unsigned int flags; TfwMsg *(*get_sched_arg)(size_t conn_type); void (*free_sched_arg)(TfwMsg *); }; diff --git a/tempesta_fw/t/unit/test_sched_hash.c b/tempesta_fw/t/unit/test_sched_hash.c index fd40669c4..0c1bb1a7f 100644 --- a/tempesta_fw/t/unit/test_sched_hash.c +++ b/tempesta_fw/t/unit/test_sched_hash.c @@ -53,12 +53,7 @@ static char *req_strs[] = { }; static TfwMsg *sched_hash_get_arg(size_t conn_type); - -static void -sched_hash_free_arg(TfwMsg *msg) -{ - test_req_free((TfwHttpReq *)msg); -} +static void sched_hash_free_arg(TfwMsg *msg); static struct TestSchedHelper sched_helper_hash = { .sched = "hash", @@ -67,6 +62,12 @@ static struct TestSchedHelper 
sched_helper_hash = { .free_sched_arg = &sched_hash_free_arg, }; +static void +sched_hash_free_arg(TfwMsg *msg) +{ + test_req_free((TfwHttpReq *)msg); +} + static TfwMsg * sched_hash_get_arg(size_t conn_type) { @@ -103,20 +104,20 @@ TEST(tfw_sched_hash, one_srv_in_sg_and_max_conn) { size_t i, j; - TfwSrvGroup *sg = test_create_sg("test", sched_helper_hash.sched); + TfwSrvGroup *sg = test_create_sg("test"); TfwServer *srv = test_create_srv("127.0.0.1", sg); - for (i = 0; i < TFW_SRV_MAX_CONN; ++i) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - } + for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) + test_create_srv_conn(srv); + + test_start_sg(sg, sched_helper_hash.sched); /* Check that every request is scheduled to the same connection. */ for (i = 0; i < sched_helper_hash.conn_types; ++i) { TfwSrvConn *expect_conn = NULL; + TfwMsg *msg = sched_helper_hash.get_sched_arg(i); - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwMsg *msg = sched_helper_hash.get_sched_arg(i); + for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) { TfwSrvConn *srv_conn = sg->sched->sched_srv(msg, sg); EXPECT_NOT_NULL(srv_conn); @@ -126,8 +127,8 @@ TEST(tfw_sched_hash, one_srv_in_sg_and_max_conn) EXPECT_EQ(srv_conn, expect_conn); tfw_srv_conn_put(srv_conn); - sched_helper_hash.free_sched_arg(msg); } + sched_helper_hash.free_sched_arg(msg); } test_conn_release_all(sg); @@ -148,25 +149,25 @@ TEST(tfw_sched_hash, max_srv_in_sg_and_zero_conn) */ TEST(tfw_sched_hash, max_srv_in_sg_and_max_conn) { - size_t i, j; + unsigned long i, j; - TfwSrvGroup *sg = test_create_sg("test", sched_helper_hash.sched); + TfwSrvGroup *sg = test_create_sg("test"); - for (i = 0; i < TFW_SG_MAX_SRV; ++i) { + for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { TfwServer *srv = test_create_srv("127.0.0.1", sg); - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - } + for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) + test_create_srv_conn(srv); } + test_start_sg(sg, sched_helper_hash.sched); + /* Check that every request is scheduled to the same connection. 
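
The tests below all follow the sequence that test_start_sg() makes
possible: build the whole group first, then hand it to the scheduler.
A condensed sketch (the group name and connection count are arbitrary):

static void
example_test_setup(void)
{
	size_t i;
	TfwSrvGroup *sg = test_create_sg("example");
	TfwServer *srv = test_create_srv("127.0.0.1", sg);

	/* Create all connections before the scheduler sees the group. */
	for (i = 0; i < 4; ++i)
		test_create_srv_conn(srv);

	sg->flags = TFW_SG_F_SCHED_RATIO_STATIC;
	test_start_sg(sg, "round-robin");
}
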
*/ for (i = 0; i < sched_helper_hash.conn_types; ++i) { TfwSrvConn *expect_conn = NULL; + TfwMsg *msg = sched_helper_hash.get_sched_arg(i); - for (j = 0; j < TFW_SG_MAX_SRV * TFW_SRV_MAX_CONN; ++j) { - TfwMsg *msg = sched_helper_hash.get_sched_arg(i); + for (j = 0; j < TFW_TEST_SG_MAX_CONN_N; ++j) { TfwSrvConn *srv_conn = sg->sched->sched_srv(msg, sg); EXPECT_NOT_NULL(srv_conn); @@ -176,8 +177,8 @@ TEST(tfw_sched_hash, max_srv_in_sg_and_max_conn) EXPECT_EQ(srv_conn, expect_conn); tfw_srv_conn_put(srv_conn); - sched_helper_hash.free_sched_arg(msg); } + sched_helper_hash.free_sched_arg(msg); } test_conn_release_all(sg); diff --git a/tempesta_fw/t/unit/test_sched_http.c b/tempesta_fw/t/unit/test_sched_http.c index 426a051b5..ad92b05f6 100644 --- a/tempesta_fw/t/unit/test_sched_http.c +++ b/tempesta_fw/t/unit/test_sched_http.c @@ -110,7 +110,9 @@ TEST(tfw_sched_http, zero_rules_and_zero_conns) TEST(tfw_sched_http, one_rule_and_zero_conns) { - test_create_sg("default", "round-robin"); + TfwSrvGroup *sg = test_create_sg("default"); + sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg, "round-robin"); if (parse_cfg("sched_http_rules {\nmatch default * * *;\n}\n")) { TEST_FAIL("can't parse rules\n"); @@ -128,10 +130,11 @@ TEST(tfw_sched_http, one_wildcard_rule) TfwServer *srv; TfwSrvConn *expect_conn; - sg = test_create_sg("default", "round-robin"); + sg = test_create_sg("default"); srv = test_create_srv("127.0.0.1", sg); - expect_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, expect_conn); + expect_conn = test_create_srv_conn(srv); + sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg, "round-robin"); if (parse_cfg("sched_http_rules {\nmatch default * * *;\n}\n")) { TEST_FAIL("can't parse rules\n"); @@ -153,55 +156,65 @@ TEST(tfw_sched_http, some_rules) *expect_conn5, *expect_conn6, *expect_conn7, *expect_conn8, *expect_conn9, *expect_conn10; - sg1 = test_create_sg("sg1", "round-robin"); + sg1 = test_create_sg("sg1"); srv = test_create_srv("127.0.0.1", sg1); - expect_conn1 = test_create_conn((TfwPeer *)srv); - sg1->sched->add_conn(sg1, srv, expect_conn1); + expect_conn1 = test_create_srv_conn(srv); + sg1->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg1, "round-robin"); - sg2 = test_create_sg("sg2", "round-robin"); + sg2 = test_create_sg("sg2"); srv = test_create_srv("127.0.0.1", sg2); - expect_conn2 = test_create_conn((TfwPeer *)srv); - sg2->sched->add_conn(sg2, srv, expect_conn2); + expect_conn2 = test_create_srv_conn(srv); + sg2->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg2, "round-robin"); - sg3 = test_create_sg("sg3", "round-robin"); + sg3 = test_create_sg("sg3"); srv = test_create_srv("127.0.0.1", sg3); - expect_conn3 = test_create_conn((TfwPeer *)srv); - sg3->sched->add_conn(sg3, srv, expect_conn3); + expect_conn3 = test_create_srv_conn(srv); + sg3->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg3, "round-robin"); - sg4 = test_create_sg("sg4", "round-robin"); + sg4 = test_create_sg("sg4"); srv = test_create_srv("127.0.0.1", sg4); - expect_conn4 = test_create_conn((TfwPeer *)srv); - sg4->sched->add_conn(sg4, srv, expect_conn4); + expect_conn4 = test_create_srv_conn(srv); + sg4->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg4, "round-robin"); - sg5 = test_create_sg("sg5", "round-robin"); + sg5 = test_create_sg("sg5"); srv = test_create_srv("127.0.0.1", sg5); - expect_conn5 = test_create_conn((TfwPeer *)srv); - sg5->sched->add_conn(sg5, srv, expect_conn5); + expect_conn5 = test_create_srv_conn(srv); + sg5->flags = 
TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg5, "round-robin"); - sg6 = test_create_sg("sg6", "round-robin"); + sg6 = test_create_sg("sg6"); srv = test_create_srv("127.0.0.1", sg6); - expect_conn6 = test_create_conn((TfwPeer *)srv); - sg6->sched->add_conn(sg6, srv, expect_conn6); + expect_conn6 = test_create_srv_conn(srv); + sg6->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg6, "round-robin"); - sg7 = test_create_sg("sg7", "round-robin"); + sg7 = test_create_sg("sg7"); srv = test_create_srv("127.0.0.1", sg7); - expect_conn7 = test_create_conn((TfwPeer *)srv); - sg7->sched->add_conn(sg7, srv, expect_conn7); + expect_conn7 = test_create_srv_conn(srv); + sg7->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg7, "round-robin"); - sg8 = test_create_sg("sg8", "round-robin"); + sg8 = test_create_sg("sg8"); srv = test_create_srv("127.0.0.1", sg8); - expect_conn8 = test_create_conn((TfwPeer *)srv); - sg8->sched->add_conn(sg8, srv, expect_conn8); + expect_conn8 = test_create_srv_conn(srv); + sg8->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg8, "round-robin"); - sg9 = test_create_sg("sg9", "round-robin"); + sg9 = test_create_sg("sg9"); srv = test_create_srv("127.0.0.1", sg9); - expect_conn9 = test_create_conn((TfwPeer *)srv); - sg9->sched->add_conn(sg9, srv, expect_conn9); + expect_conn9 = test_create_srv_conn(srv); + sg9->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg9, "round-robin"); - sg10 = test_create_sg("sg10", "round-robin"); + sg10 = test_create_sg("sg10"); srv = test_create_srv("127.0.0.1", sg10); - expect_conn10 = test_create_conn((TfwPeer *)srv); - sg10->sched->add_conn(sg10, srv, expect_conn10); + expect_conn10 = test_create_srv_conn(srv); + sg10->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg10, "round-robin"); if (parse_cfg("sched_http_rules {\nmatch sg1 uri eq /foo;\n\ match sg2 uri prefix /foo/bar;\n\ @@ -313,10 +326,11 @@ TEST(tfw_sched_http, one_rule) TfwServer *srv; TfwSrvConn *expect_conn; - sg = test_create_sg("default", "round-robin"); + sg = test_create_sg("default"); srv = test_create_srv("127.0.0.1", sg); - expect_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, expect_conn); + expect_conn = test_create_srv_conn(srv); + sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg, "round-robin"); if (parse_cfg(test_cases[i].rule_str)) { TEST_FAIL("can't parse rules\n"); diff --git a/tempesta_fw/t/unit/test_sched_rr.c b/tempesta_fw/t/unit/test_sched_rr.c index 2202f21e5..188592256 100644 --- a/tempesta_fw/t/unit/test_sched_rr.c +++ b/tempesta_fw/t/unit/test_sched_rr.c @@ -58,6 +58,7 @@ sched_rr_free_arg(TfwMsg *msg __attribute__((unused))) static struct TestSchedHelper sched_helper_rr = { .sched = "round-robin", .conn_types = 1, + .flags = TFW_SG_F_SCHED_RATIO_STATIC, .get_sched_arg = &sched_rr_get_arg, .free_sched_arg = &sched_rr_free_arg, }; @@ -84,32 +85,34 @@ TEST(tfw_sched_rr, one_srv_in_sg_and_max_conn) size_t i, j; long long conn_acc = 0, conn_acc_check = 0; - TfwSrvGroup *sg = test_create_sg("test", sched_helper_rr.sched); + TfwSrvGroup *sg = test_create_sg("test"); TfwServer *srv = test_create_srv("127.0.0.1", sg); - for (i = 0; i < TFW_SRV_MAX_CONN; ++i) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); + for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) { + TfwSrvConn *srv_conn = test_create_srv_conn(srv); conn_acc ^= (long long)srv_conn; } + sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg, sched_helper_rr.sched); + /* * Check that 
connections is scheduled in the fair way: * every connection will be scheduled only once */ for (i = 0; i < sched_helper_rr.conn_types; ++i) { + TfwMsg *msg = sched_helper_rr.get_sched_arg(i); conn_acc_check = 0; - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); + for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) { TfwSrvConn *srv_conn = sg->sched->sched_srv(msg, sg); EXPECT_NOT_NULL(srv_conn); conn_acc_check ^= (long long)srv_conn; tfw_srv_conn_put(srv_conn); - sched_helper_rr.free_sched_arg(msg); } + sched_helper_rr.free_sched_arg(msg); EXPECT_EQ(conn_acc, conn_acc_check); } @@ -131,38 +134,40 @@ TEST(tfw_sched_rr, max_srv_in_sg_and_zero_conn) */ TEST(tfw_sched_rr, max_srv_in_sg_and_max_conn) { - size_t i, j; + unsigned long i, j; long long conn_acc = 0, conn_acc_check = 0; - TfwSrvGroup *sg = test_create_sg("test", sched_helper_rr.sched); + TfwSrvGroup *sg = test_create_sg("test"); - for (i = 0; i < TFW_SG_MAX_SRV; ++i) { + for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { TfwServer *srv = test_create_srv("127.0.0.1", sg); - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); + for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) { + TfwSrvConn *srv_conn = test_create_srv_conn(srv); conn_acc ^= (long long)srv_conn; } } + sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; + test_start_sg(sg, sched_helper_rr.sched); + /* * Check that connections is scheduled in the fair way: * every connection will be scheduled only once */ for (i = 0; i < sched_helper_rr.conn_types; ++i) { + TfwMsg *msg = sched_helper_rr.get_sched_arg(i); conn_acc_check = 0; - for (j = 0; j < TFW_SG_MAX_SRV * TFW_SRV_MAX_CONN; ++j) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); + for (j = 0; j < TFW_TEST_SG_MAX_CONN_N; ++j) { TfwSrvConn *srv_conn = sg->sched->sched_srv(msg, sg); EXPECT_NOT_NULL(srv_conn); conn_acc_check ^= (long long)srv_conn; tfw_srv_conn_put(srv_conn); - sched_helper_rr.free_sched_arg(msg); } + sched_helper_rr.free_sched_arg(msg); EXPECT_EQ(conn_acc, conn_acc_check); } From 183af756ff9d32d715ab57960a23d327df3e8e12 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Mon, 27 Mar 2017 01:37:06 +0300 Subject: [PATCH 06/37] Initial barebone version of ratio scheduler. This implements the complete data structure layout. Internal ratio scheduler data is allocated dynamically and populated with server group data required for the scheduler's functionality. --- tempesta_fw/sched/Makefile | 2 +- tempesta_fw/sched/tfw_sched_ratio.c | 333 ++++++++++++++++++++++++++++ tempesta_fw/sock_srv.c | 2 +- 3 files changed, 335 insertions(+), 2 deletions(-) create mode 100644 tempesta_fw/sched/tfw_sched_ratio.c diff --git a/tempesta_fw/sched/Makefile b/tempesta_fw/sched/Makefile index a2f805f53..3d75df7e5 100644 --- a/tempesta_fw/sched/Makefile +++ b/tempesta_fw/sched/Makefile @@ -20,4 +20,4 @@ EXTRA_CFLAGS += $(TFW_CFLAGS) -I$(src)/../ -I$(src)/../../tempesta_db/core EXTRA_CFLAGS += $(TTLS_CFLAGS) -obj-m = tfw_sched_hash.o tfw_sched_http.o tfw_sched_rr.o +obj-m = tfw_sched_hash.o tfw_sched_http.o tfw_sched_ratio.o tfw_sched_rr.o diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c new file mode 100644 index 000000000..100ea76c2 --- /dev/null +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -0,0 +1,333 @@ +/** + * Tempesta FW + * + * Copyright (C) 2017 Tempesta Technologies, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include +#include + +#include "tempesta_fw.h" +#include "log.h" +#include "server.h" + +MODULE_AUTHOR(TFW_AUTHOR); +MODULE_DESCRIPTION("Tempesta Ratio Scheduler"); +MODULE_VERSION("0.3.0"); +MODULE_LICENSE("GPL"); + +/** + * Individual upstream server descriptor. + * + * Connections may go up or down during failover process. + * Only fully established connections are considered by scheduler. + * + * @conn_n - number of connections to server. + * @srv - pointer to server structure. + * @conns - list of pointers to server connection structures. + * @counter - monotonic counter for choosing the next connection. + */ +typedef struct { + size_t conn_n; + TfwServer *srv; + TfwSrvConn **conns; + atomic64_t counter; +} TfwRatioSrv; + +/** + * Server data for scheduler. + * + * @sidx - server id this data is for. + * @weight - server weight. + * @osratio - original server ratio. + * @csratio - current server ratio. + */ +typedef struct { + size_t sidx; + unsigned int weight; + unsigned int osratio; + unsigned int csratio; +} TfwRatioSrvData; + +/** + * Scheduler iteration data. + * + * @lock - must be in the same cache line for faster operations. + * @csidx - current server id. + * @rearm - next server id which ratio we need to re-arm, or @srv_n + * if no re-arming is needed. + * @riter - ratio iteration, indicates the number of times we need + * to choose all servers before the current one until we + * can choose the current server. + * @crsum - current sum of all ratios, used to avoid scanning the + * list of servers with fully zeroed ratios. + * @orsum - original sum of all ratios, used to re-arm @crsum. + */ +typedef struct { + spinlock_t lock; + size_t csidx; + size_t rearm; + unsigned int riter; + unsigned int crsum; + unsigned int orsum; +} TfwRatioSchedData; + +/** + * Scheduler data. + */ +typedef struct { + TfwRatioSrvData *srvdata; + TfwRatioSchedData schdata; +} TfwRatioSched; + +/** + * The main Ratio Scheduler structure. + * + * All servers, either dead or live, are present in the list during + * the whole run-time. That may change in the future. + * + * @srv_n - number of upstream servers. + * @sched - scheduler data. + * @srvs - array of upstream server descriptors, shared between + * RCU pool entries. + */ +typedef struct { + struct rcu_head rcu; + size_t srv_n; + TfwRatioSched sched; + TfwRatioSrv *srvs; +} TfwRatio; + +/** + * The pool of TfwRatio{} structures for RCU. + * + * @pool - pool of TfwRatio{} for RCU. + * @ratio - pointer to the currently used structure. + */ +typedef struct { + TfwRatio *rpool; + TfwRatio __rcu *ratio; +} TfwRatioPool; + +/** + * Release Ratio Scheduler data from a server group. 
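
/*
 * Layout of sg->sched_data built by tfw_sched_ratio_add_grp() below:
 * a single kzalloc() block holding the pool header followed by
 * nr_cpu_ids + 1 TfwRatio entries (presumably one per CPU plus a
 * spare for the updater):
 *
 *   +--------------+----------+----------+-- ... --+----------+
 *   | TfwRatioPool | TfwRatio | TfwRatio |         | TfwRatio |
 *   +--------------+----------+----------+-- ... --+----------+
 *                  ^
 *                  rpool->rpool
 */
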
+ */ +static void +tfw_sched_ratio_cleanup(TfwSrvGroup *sg) +{ + size_t i; + TfwRatio *ratio; + TfwRatioPool *rpool = sg->sched_data; + + if (!rpool) + return; + + /* Free the data that is shared in the pool. */ + ratio = rpool->ratio; + for (i = 0; i < sg->srv_n; ++i) + if (ratio->srvs[i].conns) + kfree(ratio->srvs[i].conns); + kfree(ratio->srvs); + + /* Free the data that is unique for each pool entry. */ + for (i = 0, ratio = rpool->rpool; i <= nr_cpu_ids; ++i, ++ratio) + if (ratio->sched.srvdata) + kfree(ratio->sched.srvdata); + + kfree(rpool); + sg->sched_data = NULL; +} + +/** + * Delete a server group from Ratio Scheduler. + */ +static void +tfw_sched_ratio_del_grp(TfwSrvGroup *sg) +{ + tfw_sched_ratio_cleanup(sg); +} + +/** + * Add a server group to Ratio Scheduler. + * + * At the time this function is called the server group is fully formed + * and populated with all servers and connections. + */ +static int +tfw_sched_ratio_add_grp(TfwSrvGroup *sg) +{ + int ret = -ENOMEM; + size_t size, srv_i; + TfwServer *srv; + TfwSrvConn *srv_conn; + TfwRatioPool *rpool; + TfwRatio *ratio; + TfwRatioSrv *rsrv; + + /* + * Validate the number of servers in the group, and the number + * of connections for each server. + */ + srv_i = 0; + list_for_each_entry(srv, &sg->srv_list, list) { + size_t conn_i = 0; + list_for_each_entry(srv_conn, &srv->conn_list, list) + ++conn_i; + if (conn_i > srv->conn_n) + return -EINVAL; + ++srv_i; + } + if (srv_i > sg->srv_n) + return -EINVAL; + + /* Pool of TfwRatio{}. Initial place for Ratio Scheduler data. */ + size = sizeof(TfwRatioPool) + sizeof(TfwRatio) * (nr_cpu_ids + 1); + if (!(sg->sched_data = kzalloc(size, GFP_KERNEL))) + return -ENOMEM; + rpool = sg->sched_data; + rpool->rpool = sg->sched_data + sizeof(TfwRatioPool); + rpool->ratio = rpool->rpool; + ratio = rpool->ratio; + + /* Array to hold server descriptors. */ + size = sizeof(TfwRatioSrv) * sg->srv_n; + if (!(ratio->srvs = kzalloc(size, GFP_KERNEL))) + goto cleanup; + + /* Array to hold server data for scheduler. */ + size = sizeof(TfwRatioSrvData) * sg->srv_n; + if (!(ratio->sched.srvdata = kzalloc(size, GFP_KERNEL))) + goto cleanup; + spin_lock_init(&ratio->sched.schdata.lock); + + /* Initial setup of upstream server descriptors. */ + srv_i = 0; + rsrv = ratio->srvs; + list_for_each_entry(srv, &sg->srv_list, list) { + size_t conn_i = 0; + size = sizeof(TfwSrvConn *) * srv->conn_n; + if (!(rsrv->conns = kzalloc(size, GFP_KERNEL))) + goto cleanup; + rsrv->srv = srv; + rsrv->conn_n = srv->conn_n; + atomic64_set(&rsrv->counter, 0); + list_for_each_entry(srv_conn, &srv->conn_list, list) + rsrv->conns[conn_i++] = srv_conn; + ratio->sched.srvdata[srv_i].weight = srv->weight; + ++rsrv; + ++srv_i; + } + + /* Set up the initial ratio data. */ + if (sg->flags & TFW_SG_F_SCHED_RATIO_STATIC) + printk(KERN_ERR "ratio static.\n"); + else if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) + printk(KERN_ERR "ratio dynamic: %d\n", + sg->flags & TFW_SG_F_PSTATS_IDX_MASK); + else + BUG(); + + return 0; + +cleanup: + tfw_sched_ratio_cleanup(sg); + return ret; +} + +/** + * Add a connection and a server, if new, to the scheduler. + * Called at configuration stage, no synchronization is required. + * + * The whole server and server connections data for a group is complete + * at the time the group is added to the scheduler with add_grp(). Thus + * the actual role of the function is to make cure that data is the same. 
+ */ +static void +tfw_sched_ratio_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) +{ + static size_t srv_i = 0, conn_i = 0; + TfwRatioPool *rpool = sg->sched_data; + TfwRatio *ratio; + TfwRatioSrv *rsrv; + TfwSrvConn *rconn; + + BUG_ON(!rpool); + ratio = rpool->ratio; + + /* Make sure that data is the same. */ + rsrv = ratio->srvs + srv_i; + BUG_ON(rsrv->srv != srv); + + rconn = rsrv->conns[conn_i]; + BUG_ON(rconn != srv_conn); + + if (++conn_i == srv->conn_n) { + conn_i = 0; + if (++srv_i == sg->srv_n) + srv_i = 0; + } +} + +/** + * On each subsequent call the function returns the next available + * connection to one of the servers in the group. Connections to a + * server are rotated in pure round-robin fashion. + * + * A server is chosen according to its current weight that can be + * either static or dynamic. Servers with greater weight are chosen + * more often than servers with lesser weight. + * + * Dead connections and servers w/o live connections are skipped. + * Initially, connections with non-idempotent requests are also skipped + * in attempt to increase throughput. However, if all live connections + * contain a non-idempotent request, then re-run the algorithm and get + * the first live connection they way it is usually done. + * + * Ratio scheduler must be the fastest scheduler. Also, it's essential + * to maintain a completely fair distribution of requests to servers + * according to servers weights. + */ +static TfwSrvConn * +tfw_sched_ratio_sched_srv(TfwMsg *msg, TfwSrvGroup *sg) +{ + printk(KERN_ERR "%s scheduler called.\n", sg->sched->name); + return NULL; +} + +static TfwScheduler tfw_sched_ratio = { + .name = "ratio", + .list = LIST_HEAD_INIT(tfw_sched_ratio.list), + .add_grp = tfw_sched_ratio_add_grp, + .del_grp = tfw_sched_ratio_del_grp, + .add_conn = tfw_sched_ratio_add_conn, + .sched_srv = tfw_sched_ratio_sched_srv, +}; + +int +tfw_sched_ratio_init(void) +{ + TFW_DBG("%s: init\n", tfw_sched_ratio.name); + return tfw_sched_register(&tfw_sched_ratio); +} +module_init(tfw_sched_ratio_init); + +void +tfw_sched_ratio_exit(void) +{ + TFW_DBG("%s: exit\n", tfw_sched_ratio.name); + tfw_sched_unregister(&tfw_sched_ratio); +} +module_exit(tfw_sched_ratio_exit); diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c index 865be77d3..9a642b185 100644 --- a/tempesta_fw/sock_srv.c +++ b/tempesta_fw/sock_srv.c @@ -1084,7 +1084,7 @@ tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, return -EINVAL; } - if (!strcasecmp(sched->name, "round-robin")) + if (!strcasecmp(sched->name, "ratio")) if (tfw_cfg_handle_ratio(cs, ce, sg_flags)) return -EINVAL; From aa8f44f8be3cb53edec14f98fa8c2f3dcd63455f Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Mon, 27 Mar 2017 16:25:11 +0300 Subject: [PATCH 07/37] Incorporate ALB algorithm suggested by @krizhanovsky. For dynamic weights initialize the weight of each server in a group to the default value. That makes their weights equal initially. At this time only static equal weights are supported. --- tempesta_fw/sched/tfw_sched_ratio.c | 166 ++++++++++++++++++++++++++-- tempesta_fw/sock_srv.c | 16 +-- 2 files changed, 165 insertions(+), 17 deletions(-) diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 100ea76c2..4411fdfd8 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -52,14 +52,14 @@ typedef struct { * * @sidx - server id this data is for. * @weight - server weight. - * @osratio - original server ratio. 
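
/*
 * The renamed fields below form a quota pair: @oratio is the
 * per-round quota derived from the server's weight, and @cratio is
 * what remains of it. The scheduler decrements @cratio on each pick,
 * replenishes it from @oratio when it reaches zero, and starts a new
 * round once the running sum @crsum is exhausted.
 */
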
- * @csratio - current server ratio. + * @cratio - current server ratio. + * @oratio - original server ratio. */ typedef struct { size_t sidx; unsigned int weight; - unsigned int osratio; - unsigned int csratio; + unsigned int cratio; + unsigned int oratio; } TfwRatioSrvData; /** @@ -80,9 +80,9 @@ typedef struct { spinlock_t lock; size_t csidx; size_t rearm; - unsigned int riter; - unsigned int crsum; - unsigned int orsum; + unsigned int riter; + unsigned long crsum; + unsigned long orsum; } TfwRatioSchedData; /** @@ -160,6 +160,40 @@ tfw_sched_ratio_del_grp(TfwSrvGroup *sg) tfw_sched_ratio_cleanup(sg); } +/** + * Set up initial or static ratios for all servers in the group. + */ +static void +tfw_sched_ratio_set_static(TfwRatio *ratio) +{ + size_t srv_i; + unsigned int diff = 0, wequal; + + BUG_ON(!ratio); + wequal = ratio->srvs[0].srv->weight; + + for (srv_i = 0; srv_i < ratio->srv_n; ++srv_i) { + unsigned int weight_i = ratio->srvs[srv_i].srv->weight; + ratio->sched.srvdata[srv_i].sidx = srv_i; + ratio->sched.srvdata[srv_i].weight = weight_i; + diff |= (wequal != weight_i); + } + if (!diff) { + for (srv_i = 0; srv_i < ratio->srv_n; ++srv_i) { + ratio->sched.srvdata[srv_i].cratio = + ratio->sched.srvdata[srv_i].oratio = 1; + } + ratio->sched.schdata.csidx = 0; + ratio->sched.schdata.riter = 1; + ratio->sched.schdata.rearm = ratio->srv_n; + ratio->sched.schdata.crsum = + ratio->sched.schdata.orsum = ratio->srv_n; + return; + } + printk(KERN_ERR "%s: Different weights are not supported yet.\n", + __func__); +} + /** * Add a server group to Ratio Scheduler. * @@ -226,19 +260,21 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) atomic64_set(&rsrv->counter, 0); list_for_each_entry(srv_conn, &srv->conn_list, list) rsrv->conns[conn_i++] = srv_conn; - ratio->sched.srvdata[srv_i].weight = srv->weight; ++rsrv; ++srv_i; } + ratio->srv_n = sg->srv_n; /* Set up the initial ratio data. */ + if (!(sg->flags & (TFW_SG_F_SCHED_RATIO_STATIC + | TFW_SG_F_SCHED_RATIO_DYNAMIC))) + BUG(); if (sg->flags & TFW_SG_F_SCHED_RATIO_STATIC) printk(KERN_ERR "ratio static.\n"); else if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) printk(KERN_ERR "ratio dynamic: %d\n", sg->flags & TFW_SG_F_PSTATS_IDX_MASK); - else - BUG(); + tfw_sched_ratio_set_static(ratio); return 0; @@ -254,6 +290,8 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) * The whole server and server connections data for a group is complete * at the time the group is added to the scheduler with add_grp(). Thus * the actual role of the function is to make cure that data is the same. + * The logic is based on the assumption that servers and connections are + * submitted in the same order as they were when add_grp() was called. */ static void tfw_sched_ratio_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) @@ -281,6 +319,88 @@ tfw_sched_ratio_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) } } +static inline bool +tfw_sched_ratio_is_srv_turn(TfwRatio *ratio, size_t csidx) +{ + unsigned int headsum2, tailsum2; + TfwRatioSrvData *srvdata = ratio->sched.srvdata; + TfwRatioSchedData *schdata = &ratio->sched.schdata; + + if (!csidx) + return true; + headsum2 = (srvdata[0].cratio + srvdata[csidx - 1].cratio) * csidx; + tailsum2 = (srvdata[csidx].cratio + + (srvdata[ratio->srv_n - 1].cratio + ? : srvdata[ratio->srv_n - 1].cratio)) + * (ratio->srv_n - csidx); + return tailsum2 * schdata->riter > headsum2; +} + +/* + * Get the index of the next server + * + * The function is synchronized by a plain spin lock. 
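
/*
 * A worked reading of tfw_sched_ratio_is_srv_turn() above: with
 * srvdata[] sorted by ratio in descending order, the sums of current
 * ratios before and after @csidx are estimated as arithmetic series,
 * kept doubled to stay in integer math:
 *
 *   headsum2 = (cratio[0] + cratio[csidx - 1]) * csidx
 *   tailsum2 = (cratio[csidx] + cratio[srv_n - 1]) * (srv_n - csidx)
 *
 * Server @csidx gets its turn only when the tail, scaled by the
 * ratio iteration number, outweighs the head:
 *
 *   tailsum2 * riter > headsum2
 */
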
A lock-free + * implementation of the algorithm as it is would require too many + * atomic operations including CMPXCHG and checking loops, so it seems + * we won't win anything. + */ +static size_t +tfw_sched_ratio_next_srv(TfwRatio *ratio) +{ + size_t csidx; + TfwRatioSrvData *srvdata = ratio->sched.srvdata; + TfwRatioSchedData *schdata = &ratio->sched.schdata; + + spin_lock(&schdata->lock); +retry: + csidx = schdata->csidx; + if (!srvdata[csidx].cratio) { + if (schdata->rearm != csidx) { + ++schdata->csidx; + if (schdata->csidx == ratio->srv_n) { + schdata->csidx = 0; + schdata->riter = 1; + } + goto retry; + } + srvdata[csidx].cratio = srvdata[csidx].oratio; + ++schdata->rearm; + } + /* + * If it's the turn of the current server then take off a point + * from the server's current ratio (decrement it). Then prepare + * for the next time this function is called. If ratios of all + * servers got down to zero, then rearm everything and start + * from the beginning. Otherwise, if it's the last server in + * the group, then also start from the beginning, but do not + * re-arm as it's been re-armed already (make sure of that). + */ + if (likely(tfw_sched_ratio_is_srv_turn(ratio, csidx))) { + --srvdata[csidx].cratio; + if (unlikely(!--schdata->crsum)) { + schdata->csidx = 0; + schdata->riter = 1; + schdata->crsum = schdata->orsum; + schdata->rearm = 0; + } else if (unlikely(++schdata->csidx == ratio->srv_n)) { + BUG_ON(schdata->rearm != ratio->srv_n); + schdata->csidx = 0; + schdata->riter = 1; + } + spin_unlock(&schdata->lock); + return csidx; + } + /* + * This is not the turn of the current server. Start + * a new iteration from the server with highest ratio. + */ + schdata->csidx = 0; + ++schdata->riter; + goto retry; + + spin_unlock(&schdata->lock); +} + /** * On each subsequent call the function returns the next available * connection to one of the servers in the group. Connections to a @@ -303,7 +423,33 @@ tfw_sched_ratio_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) static TfwSrvConn * tfw_sched_ratio_sched_srv(TfwMsg *msg, TfwSrvGroup *sg) { + uint64_t idxval; + size_t csidx; + TfwRatioPool *rpool = sg->sched_data; + TfwRatio *ratio; + TfwRatioSrv *rsrv; + TfwSrvConn *srv_conn; + printk(KERN_ERR "%s scheduler called.\n", sg->sched->name); + BUG_ON(!rpool); + + rcu_read_lock(); + ratio = rcu_dereference(rpool->ratio); + BUG_ON(!ratio); + + csidx = tfw_sched_ratio_next_srv(ratio); + rsrv = &ratio->srvs[csidx]; + idxval = atomic64_inc_return(&rsrv->counter); + srv_conn = rsrv->conns[idxval % rsrv->conn_n]; + if (tfw_srv_conn_get_if_live(srv_conn)) { + printk(KERN_ERR "%s: sched srv=[%zd] conn=[%zd]\n", + __func__, csidx, + (size_t)(idxval % rsrv->conn_n)); + rcu_read_unlock(); + return(srv_conn); + } + + rcu_read_unlock(); return NULL; } diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c index 9a642b185..43a6fc2b6 100644 --- a/tempesta_fw/sock_srv.c +++ b/tempesta_fw/sock_srv.c @@ -902,12 +902,9 @@ tfw_cfg_sg_ratio_adjust(TfwSrvGroup *sg) { TfwServer *srv; - if (sg->flags & TFW_SG_F_SCHED_RATIO_STATIC) { - list_for_each_entry(srv, tfw_cfg_slst, list) - if (!srv->weight) - srv->weight = TFW_CFG_SRV_WEIGHT_DEF; - } - + list_for_each_entry(srv, tfw_cfg_slst, list) + if (!srv->weight) + srv->weight = TFW_CFG_SRV_WEIGHT_DEF; return 0; } @@ -953,7 +950,12 @@ tfw_cfgop_setup_srv_group(void) tfw_cfg_sg->flags = tfw_cfg_sg_flags; tfw_cfg_sg->flags |= tfw_cfg_retry_nip ? TFW_SRV_RETRY_NIP : 0; - /* Check 'ratio' scheduler configuration for incompatibilities. 
*/ + /* + * Check 'ratio' scheduler configuration for incompatibilities. + * Set weight to default value for each server in the group + * if no weight is provided in the configuration. For dynamic + * ratio this sets initial equal weights to all servers. + */ if (!strcasecmp(tfw_cfg_sched->name, "ratio")) { if (tfw_cfg_sg_ratio_verify(tfw_cfg_sg)) return -EINVAL; From c1265986c5a7998af593d62dbceb614103174030 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Thu, 30 Mar 2017 12:12:16 +0300 Subject: [PATCH 08/37] Remove locks from tfw_sg_release_all(). The function is called in user context when it's guaranteed that all activity has stopped. The locks do not allow the code that is executed as part of tfw_sg_release_all() execution to run code that may sleep, which should be allowed in used context. --- tempesta_fw/server.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tempesta_fw/server.c b/tempesta_fw/server.c index 1eb0f7ad5..85ff95643 100644 --- a/tempesta_fw/server.c +++ b/tempesta_fw/server.c @@ -246,25 +246,14 @@ tfw_sg_release_all(void) TfwServer *srv, *srv_tmp; TfwSrvGroup *sg, *sg_tmp; - write_lock(&sg_lock); - list_for_each_entry_safe(sg, sg_tmp, &sg_list, list) { - write_lock(&sg->lock); - list_for_each_entry_safe(srv, srv_tmp, &sg->srv_list, list) tfw_server_destroy(srv); - - write_unlock(&sg->lock); - if (sg->sched && sg->sched->del_grp) sg->sched->del_grp(sg); - kfree(sg); } - INIT_LIST_HEAD(&sg_list); - - write_unlock(&sg_lock); } int __init From 7fadba917bef8bd2e2d95377fbbd79ea3cff0f99 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Thu, 30 Mar 2017 16:21:39 +0300 Subject: [PATCH 09/37] Better sequence tracking for APM stats. --- tempesta_fw/apm.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index d9946a953..600834806 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -782,7 +782,8 @@ __tfw_apm_calc(TfwApmData *data, TfwPrcntlStats *pstats, int recalc) static void tfw_apm_calc(TfwApmData *data) { - int nfilled, wridx, recalc; + int nfilled, recalc; + unsigned int rdidx; unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; TfwPrcntlStats pstats = { .ith = tfw_pstats_ith, @@ -791,8 +792,8 @@ tfw_apm_calc(TfwApmData *data) }; TfwApmSEnt *asent; - wridx = ((unsigned int)atomic_read(&data->stats.rdidx) + 1) % 2; - asent = &data->stats.asent[wridx]; + rdidx = atomic_read(&data->stats.rdidx); + asent = &data->stats.asent[(rdidx + 1) % 2]; recalc = test_and_clear_bit(TFW_APM_DATA_F_RECALC, &data->flags); nfilled = __tfw_apm_calc(data, &pstats, recalc); @@ -839,15 +840,15 @@ tfw_apm_pstats_fn(unsigned long fndata) * tfw_apm_stats_bh() should be used for calls in user context. */ #define __tfw_apm_stats_body(apmdata, pstats, fn_lock, fn_unlock) \ - int rdidx, seq = pstats->seq; \ + unsigned int rdidx, seq = pstats->seq; \ TfwApmData *data = apmdata; \ TfwApmSEnt *asent; \ \ BUG_ON(!apmdata); \ \ smp_mb__before_atomic(); \ - rdidx = (unsigned int)atomic_read(&data->stats.rdidx) % 2; \ - asent = &data->stats.asent[rdidx]; \ + rdidx = atomic_read(&data->stats.rdidx); \ + asent = &data->stats.asent[rdidx % 2]; \ \ fn_lock(&asent->rwlock); \ memcpy(pstats->val, asent->pstats.val, \ @@ -868,6 +869,7 @@ tfw_apm_stats(void *apmdata, TfwPrcntlStats *pstats) { __tfw_apm_stats_body(apmdata, pstats, read_lock, read_unlock); } +EXPORT_SYMBOL(tfw_apm_stats); /* * Verify that an APM Stats user using the same set of percentiles. 
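
With the sequence number handling above, a caller can cheaply detect
whether the percentile values were updated since its last call. A
sketch of the read side (the saved @seq variable and the server
pointer are assumed to be kept by the caller, as the ratio scheduler
does in a later patch):

	unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 };
	TfwPrcntlStats pstats = {
		.ith = tfw_pstats_ith,
		.val = val,
		.psz = ARRAY_SIZE(tfw_pstats_ith),
		.seq = seq,
	};

	/* A non-zero return value means the stored percentiles may
	 * have changed since @seq; the new sequence number comes
	 * back in pstats.seq. */
	if (tfw_apm_stats(srv->apm, &pstats))
		seq = pstats.seq;
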
From 03b7a50443f5bc690f4d65e7556a9eb0c771c51c Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Thu, 30 Mar 2017 16:22:31 +0300 Subject: [PATCH 10/37] Get rid of duplicate EXPORT_SYMBOL(tfw_apm_stats) in unit tests. --- tempesta_fw/t/unit/test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tempesta_fw/t/unit/test.c b/tempesta_fw/t/unit/test.c index cda0aa42b..3b01edfcc 100644 --- a/tempesta_fw/t/unit/test.c +++ b/tempesta_fw/t/unit/test.c @@ -22,6 +22,9 @@ #include #include "test.h" +#undef tfw_apm_stats +#define tfw_apm_stats test_tfw_apm_stats + #include "apm.c" #include "vhost.c" From 1f8266b65a579e452a29037ddc4f1f3bedc88c3b Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Fri, 31 Mar 2017 00:33:47 +0300 Subject: [PATCH 11/37] Full support of static or dynamic weights of different values. Static weights, possibly different, are specified in the configuration file. Dynamic weights are derived indirectly from RTT values provided by APM module on periodic basis. The weights are converted to ratios used by the ratio scheduler to distribute requests proportionally to each server's weight. --- tempesta_fw/sched/tfw_sched_ratio.c | 770 +++++++++++++++++++++------- 1 file changed, 574 insertions(+), 196 deletions(-) diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 4411fdfd8..3474e2d32 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -19,8 +19,10 @@ */ #include #include +#include #include "tempesta_fw.h" +#include "apm.h" #include "log.h" #include "server.h" @@ -29,34 +31,38 @@ MODULE_DESCRIPTION("Tempesta Ratio Scheduler"); MODULE_VERSION("0.3.0"); MODULE_LICENSE("GPL"); +#define TFW_SCHED_RATIO_INTVL (HZ / 20) /* The timer periodicity. */ + /** * Individual upstream server descriptor. * * Connections may go up or down during failover process. * Only fully established connections are considered by scheduler. * - * @conn_n - number of connections to server. * @srv - pointer to server structure. * @conns - list of pointers to server connection structures. * @counter - monotonic counter for choosing the next connection. + * @conn_n - number of connections to server. + * @seq - current sequence number for APM pstats. */ typedef struct { - size_t conn_n; TfwServer *srv; TfwSrvConn **conns; atomic64_t counter; -} TfwRatioSrv; + size_t conn_n; + unsigned int seq; +} TfwRatioSrvDesc; /** * Server data for scheduler. * - * @sidx - server id this data is for. + * @sdidx - index of server descriptor this data is for. * @weight - server weight. * @cratio - current server ratio. * @oratio - original server ratio. */ typedef struct { - size_t sidx; + size_t sdidx; unsigned int weight; unsigned int cratio; unsigned int oratio; @@ -66,32 +72,24 @@ typedef struct { * Scheduler iteration data. * * @lock - must be in the same cache line for faster operations. - * @csidx - current server id. - * @rearm - next server id which ratio we need to re-arm, or @srv_n - * if no re-arming is needed. + * @csidx - index of current server data entry. + * @reidx - index of next server data entry which ratio we need + * to reset, or @srv_n if no resetting is needed. * @riter - ratio iteration, indicates the number of times we need * to choose all servers before the current one until we * can choose the current server. * @crsum - current sum of all ratios, used to avoid scanning the * list of servers with fully zeroed ratios. - * @orsum - original sum of all ratios, used to re-arm @crsum. 
+ * @orsum - original sum of all ratios, used to reset @crsum. */ typedef struct { spinlock_t lock; size_t csidx; - size_t rearm; + size_t reidx; unsigned int riter; unsigned long crsum; unsigned long orsum; -} TfwRatioSchedData; - -/** - * Scheduler data. - */ -typedef struct { - TfwRatioSrvData *srvdata; - TfwRatioSchedData schdata; -} TfwRatioSched; +} TfwRatioSchData; /** * The main Ratio Scheduler structure. @@ -99,16 +97,24 @@ typedef struct { * All servers, either dead or live, are present in the list during * the whole run-time. That may change in the future. * + * @rcu - RCU control structure; + * @free - indicates that the pool entry is available for use. * @srv_n - number of upstream servers. + * @psidx - APM pstats[] value index for dynamic ratios. * @sched - scheduler data. - * @srvs - array of upstream server descriptors, shared between + * @srvdesc - array of upstream server descriptors, shared between * RCU pool entries. + * @srvdata - scheduler data specific to each server in the group. + * @schdata - scheduler data common to all servers in the group. */ typedef struct { - struct rcu_head rcu; - size_t srv_n; - TfwRatioSched sched; - TfwRatioSrv *srvs; + struct rcu_head rcu; + atomic_t free; + size_t srv_n; + size_t psidx; + TfwRatioSrvDesc *srvdesc; + TfwRatioSrvData *srvdata; + TfwRatioSchData schdata; } TfwRatio; /** @@ -116,246 +122,444 @@ typedef struct { * * @pool - pool of TfwRatio{} for RCU. * @ratio - pointer to the currently used structure. + * @rearm - indicates if the timer can be re-armed. + * @timer - periodic timer for dynamic APM data. */ typedef struct { - TfwRatio *rpool; - TfwRatio __rcu *ratio; + TfwRatio *rpool; + TfwRatio __rcu *ratio; + atomic_t rearm; + struct timer_list timer; } TfwRatioPool; /** - * Release Ratio Scheduler data from a server group. + * Swap two server data entries. Required for sorting by sort(). */ static void -tfw_sched_ratio_cleanup(TfwSrvGroup *sg) +tfw_sched_ratio_srvdata_swap(void *lhs, void *rhs, int size) { - size_t i; - TfwRatio *ratio; - TfwRatioPool *rpool = sg->sched_data; - - if (!rpool) - return; - - /* Free the data that is shared in the pool. */ - ratio = rpool->ratio; - for (i = 0; i < sg->srv_n; ++i) - if (ratio->srvs[i].conns) - kfree(ratio->srvs[i].conns); - kfree(ratio->srvs); - - /* Free the data that is unique for each pool entry. */ - for (i = 0, ratio = rpool->rpool; i <= nr_cpu_ids; ++i, ++ratio) - if (ratio->sched.srvdata) - kfree(ratio->sched.srvdata); - - kfree(rpool); - sg->sched_data = NULL; + TfwRatioSrvData *lhs_data = (TfwRatioSrvData *)lhs; + TfwRatioSrvData *rhs_data = (TfwRatioSrvData *)rhs; + TfwRatioSrvData tmp = *lhs_data; + *lhs_data = *rhs_data; + *rhs_data = tmp; } /** - * Delete a server group from Ratio Scheduler. + * Sort server data entries by ratio in descending order. Entries + * with higher ratios are moved towards the start of the array. */ -static void -tfw_sched_ratio_del_grp(TfwSrvGroup *sg) +static int +tfw_sched_ratio_srvdata_cmp(const void *lhs, const void *rhs) { - tfw_sched_ratio_cleanup(sg); + unsigned int lhs_ratio = ((const TfwRatioSrvData *)lhs)->oratio; + unsigned int rhs_ratio = ((const TfwRatioSrvData *)rhs)->oratio; + + if (lhs_ratio > rhs_ratio) + return -1; + if (lhs_ratio < rhs_ratio) + return 1; + return 0; } /** - * Set up initial or static ratios for all servers in the group. + * Calculate and set up ratios for each server in the group. + * + * Return 0 if done with the ratios. + * Return a non-zero value if additional actions are needed. 
*/ -static void -tfw_sched_ratio_set_static(TfwRatio *ratio) +static int +tfw_sched_ratio_calc(TfwRatio *ratio, unsigned int *arg_max_val_idx) { - size_t srv_i; - unsigned int diff = 0, wequal; + size_t si; + unsigned int diff, max_val_idx, max_wgt, oratio; + unsigned long unit, sum_wgt = 0, sum_ratio = 0; + TfwRatioSrvData *srvdata = ratio->srvdata; + TfwRatioSchData *schdata = &ratio->schdata; BUG_ON(!ratio); - wequal = ratio->srvs[0].srv->weight; - for (srv_i = 0; srv_i < ratio->srv_n; ++srv_i) { - unsigned int weight_i = ratio->srvs[srv_i].srv->weight; - ratio->sched.srvdata[srv_i].sidx = srv_i; - ratio->sched.srvdata[srv_i].weight = weight_i; - diff |= (wequal != weight_i); + /* + * Calculate the sum of server's weights in the group. Remember + * the index of server data entry with maximum weight. That same + * entry will also have the maximum ratio. See if all weights in + * the group are the same. + */ + diff = max_val_idx = 0; + for (si = 0; si < ratio->srv_n; ++si) { + if (srvdata[max_val_idx].weight < srvdata[si].weight) + max_val_idx = si; + sum_wgt += srvdata[si].weight; + diff |= (srvdata[si].weight != srvdata[0].weight); } + + /* Set up the common part of scheduler data. */ + schdata->csidx = 0; + schdata->riter = 1; + schdata->reidx = ratio->srv_n; + + /* + * If all server weights are the same, then there's no need to do + * anything else. Set up all ratios to 1 and be done with it. + */ if (!diff) { - for (srv_i = 0; srv_i < ratio->srv_n; ++srv_i) { - ratio->sched.srvdata[srv_i].cratio = - ratio->sched.srvdata[srv_i].oratio = 1; - } - ratio->sched.schdata.csidx = 0; - ratio->sched.schdata.riter = 1; - ratio->sched.schdata.rearm = ratio->srv_n; - ratio->sched.schdata.crsum = - ratio->sched.schdata.orsum = ratio->srv_n; - return; + for (si = 0; si < ratio->srv_n; ++si) + srvdata[si].cratio = srvdata[si].oratio = 1; + schdata->crsum = schdata->orsum = ratio->srv_n; + return 0; } - printk(KERN_ERR "%s: Different weights are not supported yet.\n", - __func__); + + /* + * Calculate each server's ratio using a special formula. See + * if all calculated ratios are the same. Set up scheduler data. + */ + max_wgt = srvdata[max_val_idx].weight; + unit = ((max_wgt + ratio->srv_n) * max_wgt) / sum_wgt; + for (si = 0; si < ratio->srv_n; ++si) { + oratio = (unit * srvdata[si].weight) / max_wgt ? : 1; + srvdata[si].cratio = srvdata[si].oratio = oratio; + diff |= (oratio != srvdata[0].oratio); + sum_ratio += oratio; + } + schdata->crsum = schdata->orsum = sum_ratio; + + /* Return the index of server data entry with maximum ratio. */ + *arg_max_val_idx = max_val_idx; + + return diff; +} + +/* + * Calculate and set up ratios for each server in a group based on + * weights that are statically defined in the configuration file. + */ +static void +tfw_sched_ratio_calc_static(TfwRatio *ratio) +{ + size_t si; + unsigned int max_val_idx = 0; + TfwRatioSrvDesc *srvdesc = ratio->srvdesc; + TfwRatioSrvData *srvdata = ratio->srvdata; + + /* Collect server weights from the configuration. */ + for (si = 0; si < ratio->srv_n; ++si) { + srvdata[si].sdidx = si; + srvdata[si].weight = srvdesc[si].srv->weight; + } + + /* Calculate ratios based on server weights. */ + if (!tfw_sched_ratio_calc(ratio, &max_val_idx)) + return; + + /* Sort server data entries by ratio in descending order. */ + sort(srvdata, ratio->srv_n, sizeof(srvdata[0]), + tfw_sched_ratio_srvdata_cmp, tfw_sched_ratio_srvdata_swap); } /** - * Add a server group to Ratio Scheduler. 
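
/*
 * A worked example of the formula in tfw_sched_ratio_calc() above
 * (the numbers are illustrative only). For weights {30, 20, 10},
 * srv_n = 3:
 *
 *   sum_wgt = 60, max_wgt = 30
 *   unit    = ((30 + 3) * 30) / 60 = 16
 *   oratio  = 16 * 30 / 30 = 16
 *             16 * 20 / 30 = 10
 *             16 * 10 / 30 = 5
 *
 * The ratios {16, 10, 5} keep roughly the weights' 3:2:1 proportion,
 * and the "? : 1" fallback guarantees that integer division never
 * leaves a server with a zero ratio.
 */
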
+ * Calculate ratios for each server in a group based on dynamic data.
+ * The function runs periodically on a timer and provides the data that
+ * is used by the ratio scheduler for outgoing requests.
 *
- * At the time this function is called the server group is fully formed
- * and populated with all servers and connections.
+ * The latest dynamic data is provided by the APM module and represents
+ * RTT values for each server in a group. Ratios are calculated on those
+ * RTT values. However, that way the ratios do not represent the real
+ * weight of each server, because a bigger RTT value means that a server
+ * is less favorable and has a lesser, NOT bigger weight.
+ *
+ * Based on ratios calculated from RTT values, the algorithm here assigns
+ * a correct ratio to each server in the group.
+ * 1. If the minimal ratio is 1, then fill the entries with minimal ratio
+ *    with values from an entry with the maximum ratio. Fill the entries
+ *    with maximum ratio with values from an entry with minimal ratio.
+ * 2. Sort the resulting array by ratio in descending order as required
+ *    by the scheduling algorithm.
+ * 3. Select the part of the array that omits entries from step 1 if any.
+ *    Those are entries at the start and at the end of the array. Reverse
+ *    the sequence of server descriptor indices in that part of the array.
+ *    The resulting pairing of servers to ratios is the target.
+ *
+ * Return 0 if there are no new ratio values.
+ * Return a non-zero value if new ratio values were calculated.
 */
 static int
-tfw_sched_ratio_add_grp(TfwSrvGroup *sg)
+tfw_sched_ratio_calc_dynamic(TfwRatio *ratio)
 {
-	int ret = -ENOMEM;
-	size_t size, srv_i;
-	TfwServer *srv;
-	TfwSrvConn *srv_conn;
-	TfwRatioPool *rpool;
-	TfwRatio *ratio;
-	TfwRatioSrv *rsrv;
+	size_t si, left = 0, right = 0;
+	unsigned int recalc = 0, max_ratio = 0;
+	unsigned int has_one_val = 0, max_val_idx = 0;
+	unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 };
+	TfwPrcntlStats pstats = {
+		.ith = tfw_pstats_ith,
+		.val = val,
+		.psz = ARRAY_SIZE(tfw_pstats_ith)
+	};
+	TfwRatioSrvData *srvdata = ratio->srvdata;
+	TfwRatioSrvDesc *srvdesc = ratio->srvdesc;

	/*
-	 * Validate the number of servers in the group, and the number
-	 * of connections for each server.
+	 * Collect server RTT values from the APM module. See if the APM
+	 * may have provided new data, so that a recalculation is required.
+	 * Otherwise there's nothing to do.
+	 *
+	 * TODO: The following cases should be considered.
+	 * 1. APM recalculates the stats on each request-response pair.
+	 *    It's quite possible that the actual stats values did not
+	 *    change. However, the APM doesn't know of that and reports
+	 *    that the values may have changed. It would be great to
+	 *    catch that and avoid the recalculation of ratios.
+	 * 2. Depending on actual RTT values a small deviation from the
+	 *    previous value should be acceptable. It should not cause
+	 *    a recalculation of ratio.
+	 * 3. Finally, a typical case is that only a handful of servers
+	 *    misbehave in a large group of servers. Is there a way to
+	 *    detect that and do a partial recalculation of ratios?
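+	 *
+	 * Note: the "?: 1" fallback below substitutes a weight of 1 when
+	 * the APM reports a zero RTT value, so a server with no recorded
+	 * RTT still gets a minimal non-zero weight.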
*/ - srv_i = 0; - list_for_each_entry(srv, &sg->srv_list, list) { - size_t conn_i = 0; - list_for_each_entry(srv_conn, &srv->conn_list, list) - ++conn_i; - if (conn_i > srv->conn_n) - return -EINVAL; - ++srv_i; + for (si = 0; si < ratio->srv_n; ++si) { + pstats.seq = srvdesc[si].seq; + recalc |= tfw_apm_stats(srvdesc[si].srv->apm, &pstats); + srvdesc[si].seq = pstats.seq; + + srvdata[si].sdidx = si; + srvdata[si].weight = pstats.val[ratio->psidx] ? : 1; } - if (srv_i > sg->srv_n) - return -EINVAL; + if (!recalc) + return 0; - /* Pool of TfwRatio{}. Initial place for Ratio Scheduler data. */ - size = sizeof(TfwRatioPool) + sizeof(TfwRatio) * (nr_cpu_ids + 1); - if (!(sg->sched_data = kzalloc(size, GFP_KERNEL))) - return -ENOMEM; - rpool = sg->sched_data; - rpool->rpool = sg->sched_data + sizeof(TfwRatioPool); - rpool->ratio = rpool->rpool; - ratio = rpool->ratio; + /* Calculate ratios based on server RTT values. */ + if (!tfw_sched_ratio_calc(ratio, &max_val_idx)) + return 1; - /* Array to hold server descriptors. */ - size = sizeof(TfwRatioSrv) * sg->srv_n; - if (!(ratio->srvs = kzalloc(size, GFP_KERNEL))) - goto cleanup; + /* + * It's guaranteed here that NOT all calculated ratio values are + * equal. See if there are ratio values that equal to 1. If so, + * do actions described in step 1 in the function's description. + */ + for (si = 0; si < ratio->srv_n; ++si) { + if (srvdata[si].oratio == 1) { + has_one_val = 1; + break; + } + } + if (has_one_val) { + TfwRatioSrvData sdent_one = srvdata[si]; + TfwRatioSrvData sdent_max = srvdata[max_val_idx]; + + /* Save maximum ratio value for future use. */ + max_ratio = srvdata[max_val_idx].oratio; + + for (si = 0; si < ratio->srv_n; ++si) { + if (srvdata[si].oratio == 1) { + srvdata[si].weight = sdent_max.weight; + srvdata[si].oratio = + srvdata[si].cratio = sdent_max.oratio; + } else if (srvdata[si].oratio == sdent_max.oratio) { + srvdata[si].weight = sdent_one.weight; + srvdata[si].oratio = + srvdata[si].cratio = sdent_one.oratio; + } + } + } - /* Array to hold server data for scheduler. */ - size = sizeof(TfwRatioSrvData) * sg->srv_n; - if (!(ratio->sched.srvdata = kzalloc(size, GFP_KERNEL))) - goto cleanup; - spin_lock_init(&ratio->sched.schdata.lock); + /* Sort server data entries by ratio in descending order. */ + sort(srvdata, ratio->srv_n, sizeof(srvdata[0]), + tfw_sched_ratio_srvdata_cmp, tfw_sched_ratio_srvdata_swap); - /* Initial setup of upstream server descriptors. */ - srv_i = 0; - rsrv = ratio->srvs; - list_for_each_entry(srv, &sg->srv_list, list) { - size_t conn_i = 0; - size = sizeof(TfwSrvConn *) * srv->conn_n; - if (!(rsrv->conns = kzalloc(size, GFP_KERNEL))) - goto cleanup; - rsrv->srv = srv; - rsrv->conn_n = srv->conn_n; - atomic64_set(&rsrv->counter, 0); - list_for_each_entry(srv_conn, &srv->conn_list, list) - rsrv->conns[conn_i++] = srv_conn; - ++rsrv; - ++srv_i; + /* + * Do actions described in step 3 in the function's description. + * Select the part of the array that omits entries from step 1 + * if there are any. Those are entries at the start and at the + * end of the array. Reverse the sequence of server descriptor + * indices in that part of the array. 
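+	 *
+	 * An illustration: if the sorted entries of the selected part
+	 * refer to servers as (s0:9) (s1:6) (s2:2), the reversal yields
+	 * (s2:9) (s1:6) (s0:2), so the server with the smallest RTT
+	 * ends up with the highest ratio.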
+	 */
+	if (has_one_val) {
+		left = 0;
+		right = ratio->srv_n - 1;
+	} else {
+		for (si = 0; si < ratio->srv_n; ++si)
+			if (srvdata[si].oratio == max_ratio) {
+				left = si + 1;
+			} else if (srvdata[si].oratio == 1) {
+				right = si - 1;
+				break;
+			}
+	}
+	while (left < right) {
+		size_t left_sdidx = srvdata[left].sdidx;
+		srvdata[left++].sdidx = srvdata[right].sdidx;
+		srvdata[right--].sdidx = left_sdidx;
 	}
-	ratio->srv_n = sg->srv_n;

-	/* Set up the initial ratio data. */
-	if (!(sg->flags & (TFW_SG_F_SCHED_RATIO_STATIC
-			   | TFW_SG_F_SCHED_RATIO_DYNAMIC)))
-		BUG();
-	if (sg->flags & TFW_SG_F_SCHED_RATIO_STATIC)
-		printk(KERN_ERR "ratio static.\n");
-	else if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC)
-		printk(KERN_ERR "ratio dynamic: %d\n",
-		       sg->flags & TFW_SG_F_PSTATS_IDX_MASK);
-	tfw_sched_ratio_set_static(ratio);
+	return 1;
+}

-	return 0;
+/*
+ * Get a free for use entry from the RCU pool.
+ */
+static TfwRatio *
+tfw_sched_ratio_rpool_get(TfwRatioPool *rpool)
+{
+	int si;
+	TfwRatio *ratio = rpool->rpool;
+
+	for (si = 0; si <= nr_cpu_ids; ++si, ++ratio) {
+		smp_mb__before_atomic();
+		if (atomic_read(&ratio->free)) {
+			atomic_set(&ratio->free, 0);
+			return ratio;
+		}
+	}

-cleanup:
-	tfw_sched_ratio_cleanup(sg);
-	return ret;
+	return NULL;
+}
+
+/*
+ * Return an entry to the RCU pool.
+ */
+static inline void
+__tfw_sched_ratio_rpool_put(TfwRatio *ratio)
+{
+	atomic_set(&ratio->free, 1);
+	smp_mb__after_atomic();
+}
+
+static void
+tfw_sched_ratio_rpool_put(struct rcu_head *rcup)
+{
+	TfwRatio *ratio = container_of(rcup, TfwRatio, rcu);
+	__tfw_sched_ratio_rpool_put(ratio);
 }

 /**
- * Add a connection and a server, if new, to the scheduler.
- * Called at configuration stage, no synchronization is required.
+ * Calculate the latest ratios for each server in the group in real time.
 *
- * The whole server and server connections data for a group is complete
- * at the time the group is added to the scheduler with add_grp(). Thus
- * the actual role of the function is to make cure that data is the same.
- * The logic is based on the assumption that servers and connections are
- * submitted in the same order as they were when add_grp() was called.
+ * RCU is used to avoid locks. When recalculation is in order, the new
+ * data is placed in an available entry from the RCU pool. The new entry
+ * then is seamlessly set as the current entry. The formerly active entry
+ * is returned to the RCU pool when all users of it are done and gone.
+ *
+ * It may happen that no RCU pool entry is available at the moment.
+ * That's not a big deal. Scheduling of upstream servers will continue
+ * to run on currently active data. The timer is scheduled to run ASAP
+ * and catch an RCU pool entry the moment it gets available.
+ * To make this case less probable, the number of RCU pool entries
+ * is chosen as one more than the number of CPU slots in the system.
 */
 static void
-tfw_sched_ratio_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn)
+tfw_sched_ratio_tmfn(unsigned long tmfn_data)
 {
-	static size_t srv_i = 0, conn_i = 0;
+	TfwSrvGroup *sg = (TfwSrvGroup *)tmfn_data;
 	TfwRatioPool *rpool = sg->sched_data;
-	TfwRatio *ratio;
-	TfwRatioSrv *rsrv;
-	TfwSrvConn *rconn;
+	TfwRatio *cratio, *nratio;
+	int interval = TFW_SCHED_RATIO_INTVL;

-	BUG_ON(!rpool);
-	ratio = rpool->ratio;
-
-	/* Make sure that data is the same. */
-	rsrv = ratio->srvs + srv_i;
-	BUG_ON(rsrv->srv != srv);
-
-	rconn = rsrv->conns[conn_i];
-	BUG_ON(rconn != srv_conn);
+	/*
+	 * Get an available ratio entry from the RCU pool.
If there's + * none at the moment, then try it again in a short while on + * the next run of timer function. + */ + nratio = tfw_sched_ratio_rpool_get(rpool); + if (unlikely(!nratio)) { + interval = 1; + goto rearm; + } - if (++conn_i == srv->conn_n) { - conn_i = 0; - if (++srv_i == sg->srv_n) - srv_i = 0; + /* + * Calculate dynamic ratios. If there's nothing to do, then + * return the ratio entry back to the RCU pool. + */ + if (!tfw_sched_ratio_calc_dynamic(nratio)) { + __tfw_sched_ratio_rpool_put(nratio); + goto rearm; } + + /* + * Substitute the current ratio entry with the new one for + * scheduler. The former entry will be returned to the RCU + * pool when there are no users of it. + */ + cratio = rpool->ratio; + rcu_assign_pointer(rpool->ratio, nratio); + call_rcu(&cratio->rcu, tfw_sched_ratio_rpool_put); + +rearm: + smp_mb__before_atomic(); + if (atomic_read(&rpool->rearm)) + mod_timer(&rpool->timer, jiffies + interval); } +/* + * Determine if it's the turn of the server described by the server + * data entry at index @csidx. + * + * It's the turn of server at @csidx if sums of ratios to the left and + * to the right of this entry are proportional to the current iteration. + * As the scheduler algorithm moves forward, the sum of ratios on the + * left side decreases. When a server is selected, its current ratio + * is decremented, so the sum of ratios decreases by 1 as well. + * + * With that in mind, ratios that have a huge difference should not be + * specified for servers in the same group. A decrement of a huge sum + * would be too insignificant to affect the scheduling algorithm. Thus + * weights like { 10, 1 } make more sense than weights like { 1000, 10 }. + * Requests are distributed proportionally in both cases, but significant + * bursts are possible in the first case. + * + * TODO: The algorithm may and should be improved. + */ static inline bool tfw_sched_ratio_is_srv_turn(TfwRatio *ratio, size_t csidx) { unsigned int headsum2, tailsum2; - TfwRatioSrvData *srvdata = ratio->sched.srvdata; - TfwRatioSchedData *schdata = &ratio->sched.schdata; + TfwRatioSrvData *srvdata = ratio->srvdata; + TfwRatioSchData *schdata = &ratio->schdata; if (!csidx) return true; + headsum2 = (srvdata[0].cratio + srvdata[csidx - 1].cratio) * csidx; tailsum2 = (srvdata[csidx].cratio + (srvdata[ratio->srv_n - 1].cratio - ? : srvdata[ratio->srv_n - 1].cratio)) + ? : srvdata[ratio->srv_n - 1].oratio)) * (ratio->srv_n - csidx); + return tailsum2 * schdata->riter > headsum2; } /* - * Get the index of the next server + * Get the index of the next server descriptor. + * + * The array of server data entries used by the algorithm must be sorted + * by ratio in descending order, with the higher weight entries moved + * towards the start of the array. * - * The function is synchronized by a plain spin lock. A lock-free - * implementation of the algorithm as it is would require too many - * atomic operations including CMPXCHG and checking loops, so it seems - * we won't win anything. + * For concurrent use the algorithm is synchronized by a plain spin lock. + * A lock-free implementation of the algorithm as it is would require too + * many atomic operations including CMPXCHG and checking loops. It seems + * that it won't give any advantage. 
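+ *
+ * E.g., with original ratios {3, 2, 1} each full cycle of six
+ * selections distributes the picks among the three servers in
+ * a 3:2:1 proportion.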
*/ static size_t tfw_sched_ratio_next_srv(TfwRatio *ratio) { size_t csidx; - TfwRatioSrvData *srvdata = ratio->sched.srvdata; - TfwRatioSchedData *schdata = &ratio->sched.schdata; + TfwRatioSrvData *srvdata = ratio->srvdata; + TfwRatioSchData *schdata = &ratio->schdata; + /* Start with server that has the highest ratio. */ spin_lock(&schdata->lock); retry: csidx = schdata->csidx; if (!srvdata[csidx].cratio) { - if (schdata->rearm != csidx) { + /* + * The server's counter (current ratio) is depleted, but + * the server is not due yet for re-arming. Don't choose + * this server. This is a likely branch for ratios like + * { N, 1, 1, 1, ... } where N > 1 at some point. This + * is not the case if all server weights (and therefore + * ratios) were specified as 1. In that case it's down + * to plain round-robin. + */ + if (schdata->reidx != csidx) { ++schdata->csidx; if (schdata->csidx == ratio->srv_n) { schdata->csidx = 0; @@ -364,16 +568,17 @@ tfw_sched_ratio_next_srv(TfwRatio *ratio) goto retry; } srvdata[csidx].cratio = srvdata[csidx].oratio; - ++schdata->rearm; + ++schdata->reidx; + /* Fall through */ } /* * If it's the turn of the current server then take off a point * from the server's current ratio (decrement it). Then prepare * for the next time this function is called. If ratios of all - * servers got down to zero, then rearm everything and start + * servers got down to zero, then reset everything and start * from the beginning. Otherwise, if it's the last server in * the group, then also start from the beginning, but do not - * re-arm as it's been re-armed already (make sure of that). + * reset as it's been reset already (make sure of that). */ if (likely(tfw_sched_ratio_is_srv_turn(ratio, csidx))) { --srvdata[csidx].cratio; @@ -381,14 +586,14 @@ tfw_sched_ratio_next_srv(TfwRatio *ratio) schdata->csidx = 0; schdata->riter = 1; schdata->crsum = schdata->orsum; - schdata->rearm = 0; + schdata->reidx = 0; } else if (unlikely(++schdata->csidx == ratio->srv_n)) { - BUG_ON(schdata->rearm != ratio->srv_n); + BUG_ON(schdata->reidx != ratio->srv_n); schdata->csidx = 0; schdata->riter = 1; } spin_unlock(&schdata->lock); - return csidx; + return srvdata[csidx].sdidx; } /* * This is not the turn of the current server. Start @@ -427,10 +632,9 @@ tfw_sched_ratio_sched_srv(TfwMsg *msg, TfwSrvGroup *sg) size_t csidx; TfwRatioPool *rpool = sg->sched_data; TfwRatio *ratio; - TfwRatioSrv *rsrv; + TfwRatioSrvDesc *srvdesc; TfwSrvConn *srv_conn; - printk(KERN_ERR "%s scheduler called.\n", sg->sched->name); BUG_ON(!rpool); rcu_read_lock(); @@ -438,13 +642,10 @@ tfw_sched_ratio_sched_srv(TfwMsg *msg, TfwSrvGroup *sg) BUG_ON(!ratio); csidx = tfw_sched_ratio_next_srv(ratio); - rsrv = &ratio->srvs[csidx]; - idxval = atomic64_inc_return(&rsrv->counter); - srv_conn = rsrv->conns[idxval % rsrv->conn_n]; + srvdesc = &ratio->srvdesc[csidx]; + idxval = atomic64_inc_return(&srvdesc->counter); + srv_conn = srvdesc->conns[idxval % srvdesc->conn_n]; if (tfw_srv_conn_get_if_live(srv_conn)) { - printk(KERN_ERR "%s: sched srv=[%zd] conn=[%zd]\n", - __func__, csidx, - (size_t)(idxval % rsrv->conn_n)); rcu_read_unlock(); return(srv_conn); } @@ -453,6 +654,183 @@ tfw_sched_ratio_sched_srv(TfwMsg *msg, TfwSrvGroup *sg) return NULL; } +/** + * Release Ratio Scheduler data from a server group. + */ +static void +tfw_sched_ratio_cleanup(TfwSrvGroup *sg) +{ + size_t si; + TfwRatio *ratio; + TfwRatioPool *rpool = sg->sched_data; + + if (!rpool) + return; + + /* Free the data that is shared between pool entries. 
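+	 * The server descriptor array and each server's connection array
+	 * exist in a single copy referenced by every pool entry, so they
+	 * are freed here only once.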
*/ + ratio = rpool->rpool; + for (si = 0; si < sg->srv_n; ++si) + if (ratio->srvdesc[si].conns) + kfree(ratio->srvdesc[si].conns); + kfree(ratio->srvdesc); + + /* Free the data that is unique for each pool entry. */ + for (si = 0, ratio = rpool->rpool; si <= nr_cpu_ids; ++si, ++ratio) + if (ratio->srvdata) + kfree(ratio->srvdata); + + kfree(rpool); + sg->sched_data = NULL; +} + +/** + * Delete a server group from Ratio Scheduler. + */ +static void +tfw_sched_ratio_del_grp(TfwSrvGroup *sg) +{ + TfwRatioPool *rpool = sg->sched_data; + + if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) { + atomic_set(&rpool->rearm, 0); + smp_mb__after_atomic(); + del_timer_sync(&rpool->timer); + } + synchronize_rcu(); + tfw_sched_ratio_cleanup(sg); +} + +/** + * Add a server group to Ratio Scheduler. + * + * At the time this function is called the server group is fully formed + * and populated with all servers and connections. + */ +static int +tfw_sched_ratio_add_grp(TfwSrvGroup *sg) +{ + int ret = -ENOMEM; + size_t size, si, ci; + TfwServer *srv; + TfwSrvConn *srv_conn; + TfwRatio *ratio; + TfwRatioPool *rpool; + TfwRatioSrvDesc *trsdesc, *srvdesc; + + /* + * Validate the number of servers in the group, and the number + * of connections for each server. + */ + si = 0; + list_for_each_entry(srv, &sg->srv_list, list) { + ci = 0; + list_for_each_entry(srv_conn, &srv->conn_list, list) + ++ci; + if (ci > srv->conn_n) + return -EINVAL; + ++si; + } + if (si > sg->srv_n) + return -EINVAL; + + /* Pool of TfwRatio{}. Initial place for Ratio Scheduler data. */ + size = sizeof(TfwRatioPool) + sizeof(TfwRatio) * (nr_cpu_ids + 1); + if (!(sg->sched_data = kzalloc(size, GFP_KERNEL))) + return -ENOMEM; + rpool = sg->sched_data; + rpool->rpool = sg->sched_data + sizeof(TfwRatioPool); + rpool->ratio = rpool->rpool; + + /* Array for server descriptors. Shared between RCU pool entries. */ + size = sizeof(TfwRatioSrvDesc) * sg->srv_n; + if (!(trsdesc = kzalloc(size, GFP_KERNEL))) + goto cleanup; + rpool->rpool[0].srvdesc = trsdesc; + + /* Set up each RCU pool entry with required arrays and data. */ + size = sizeof(TfwRatioSrvData) * sg->srv_n; + for (si = 0, ratio = rpool->rpool; si <= nr_cpu_ids; ++si, ++ratio) { + if (!(ratio->srvdata = kzalloc(size, GFP_KERNEL))) + goto cleanup; + spin_lock_init(&ratio->schdata.lock); + ratio->srvdesc = trsdesc; + ratio->srv_n = sg->srv_n; + ratio->psidx = sg->flags & TFW_SG_F_PSTATS_IDX_MASK; + atomic_set(&ratio->free, 1); + } + + /* Initial setup of upstream server descriptors. */ + srvdesc = trsdesc; + list_for_each_entry(srv, &sg->srv_list, list) { + size = sizeof(TfwSrvConn *) * srv->conn_n; + if (!(srvdesc->conns = kzalloc(size, GFP_KERNEL))) + goto cleanup; + ci = 0; + list_for_each_entry(srv_conn, &srv->conn_list, list) + srvdesc->conns[ci++] = srv_conn; + srvdesc->conn_n = srv->conn_n; + srvdesc->srv = srv; + atomic64_set(&srvdesc->counter, 0); + ++srvdesc; + } + + /* Set up the initial ratio data. */ + if (!(sg->flags & (TFW_SG_F_SCHED_RATIO_STATIC + | TFW_SG_F_SCHED_RATIO_DYNAMIC))) + BUG(); + + tfw_sched_ratio_calc_static(rpool->ratio); + + if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) { + atomic_set(&rpool->rearm, 1); + setup_timer(&rpool->timer, + tfw_sched_ratio_tmfn, (unsigned long)sg); + mod_timer(&rpool->timer, jiffies + TFW_SCHED_RATIO_INTVL); + } + + return 0; + +cleanup: + tfw_sched_ratio_cleanup(sg); + return ret; +} + +/** + * Add a connection and a server, if new, to the scheduler. + * Called at configuration stage, no synchronization is required. 
+ *
+ * The whole server and server connections data for a group is complete
+ * at the time the group is added to the scheduler with add_grp(). Thus
+ * the actual role of the function is to make sure that data is the same.
+ * The logic is based on the assumption that servers and connections are
+ * submitted in the same order as they were when add_grp() was called.
+ */
+static void
+tfw_sched_ratio_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn)
+{
+	static size_t si = 0, ci = 0;
+	TfwRatioPool *rpool = sg->sched_data;
+	TfwRatio *ratio;
+	TfwRatioSrvDesc *srvdesc;
+	TfwSrvConn *rconn;
+
+	BUG_ON(!rpool);
+	ratio = rpool->ratio;
+
+	/* Make sure that data is the same. */
+	srvdesc = ratio->srvdesc + si;
+	BUG_ON(srvdesc->srv != srv);
+
+	rconn = srvdesc->conns[ci];
+	BUG_ON(rconn != srv_conn);
+
+	if (++ci == srv->conn_n) {
+		ci = 0;
+		if (++si == sg->srv_n)
+			si = 0;
+	}
+}
+
 static TfwScheduler tfw_sched_ratio = {
	.name		= "ratio",
	.list		= LIST_HEAD_INIT(tfw_sched_ratio.list),

From f204ebb0ff572454aca2583ffdfbfb7d702d53c0 Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Tue, 4 Apr 2017 00:49:56 +0300
Subject: [PATCH 12/37] Bugfix: Adjust the sum of ratios after min and max
 ratios are exchanged.

---
 tempesta_fw/sched/Makefile          |  1 +
 tempesta_fw/sched/tfw_sched_ratio.c | 27 ++++++++++++++++-----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/tempesta_fw/sched/Makefile b/tempesta_fw/sched/Makefile
index 3d75df7e5..660bda481 100644
--- a/tempesta_fw/sched/Makefile
+++ b/tempesta_fw/sched/Makefile
@@ -21,3 +21,4 @@ EXTRA_CFLAGS += $(TFW_CFLAGS) -I$(src)/../ -I$(src)/../../tempesta_db/core
 EXTRA_CFLAGS += $(TTLS_CFLAGS)
 
 obj-m = tfw_sched_hash.o tfw_sched_http.o tfw_sched_ratio.o tfw_sched_rr.o
+### obj-m = tfw_sched_hash.o tfw_sched_http.o tfw_sched_rr.o
diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c
index 5c53bcd12..2da876e60 100644
--- a/tempesta_fw/sched/tfw_sched_ratio.c
+++ b/tempesta_fw/sched/tfw_sched_ratio.c
@@ -334,6 +334,7 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio)
 	 * It's guaranteed here that NOT all calculated ratio values are
 	 * equal. See if there are ratio values that equal to 1. If so,
 	 * do actions described in step 1 in the function's description.
+	 * Adjust the sum of ratios that is changed in this procedure.
 	 */
 	for (si = 0; si < ratio->srv_n; ++si) {
 		if (srvdata[si].oratio == 1) {
@@ -342,6 +343,7 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio)
 		}
 	}
 	if (has_one_val) {
+		unsigned int orsum = ratio->schdata.orsum;
 		TfwRatioSrvData sdent_one = srvdata[si];
 		TfwRatioSrvData sdent_max = srvdata[max_val_idx];
 
@@ -353,12 +355,15 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio)
 			srvdata[si].weight = sdent_max.weight;
 			srvdata[si].oratio =
 				srvdata[si].cratio = sdent_max.oratio;
+			orsum += sdent_max.oratio - 1;
 		} else if (srvdata[si].oratio == sdent_max.oratio) {
 			srvdata[si].weight = sdent_one.weight;
 			srvdata[si].oratio =
 				srvdata[si].cratio = sdent_one.oratio;
+			orsum -= sdent_max.oratio - 1;
 		}
 	}
+	ratio->schdata.crsum = ratio->schdata.orsum = orsum;
 }
 
 /* Sort server data entries by ratio in descending order. */
@@ -743,19 +748,16 @@ tfw_sched_ratio_cleanup(TfwSrvGroup *sg)
 
	/* Free the data that is shared between pool entries.
*/ ratio = rpool->rpool; - for (si = 0; si < sg->srv_n; ++si) { - TfwRatioSrvDesc *srvdesc = &ratio->srvdesc[si]; - if (srvdesc->conns) - kfree(srvdesc->conns); - if (srvdesc->srv) - srvdesc->srv->sched_data = NULL; - } + for (si = 0; si < sg->srv_n; ++si) + if (ratio->srvdesc[si].conns) + kfree(ratio->srvdesc[si].conns); kfree(ratio->srvdesc); /* Free the data that is unique for each pool entry. */ - for (si = 0, ratio = rpool->rpool; si <= nr_cpu_ids; ++si, ++ratio) - if (ratio->srvdata) - kfree(ratio->srvdata); + ratio = rpool->rpool; + for (si = 0; si <= nr_cpu_ids; ++si) + if (ratio[si].srvdata) + kfree(ratio[si].srvdata); kfree(rpool); sg->sched_data = NULL; @@ -853,7 +855,10 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) ++srvdesc; } - /* Set up the initial ratio data. */ + /* + * Set up the initial ratio data. For dynamic ratios it's all + * equal initial weights. + */ if (!(sg->flags & (TFW_SG_F_SCHED_RATIO_STATIC | TFW_SG_F_SCHED_RATIO_DYNAMIC))) BUG(); From b00020972c449d8605b49c7ca1d6104dce1f9f55 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Tue, 4 Apr 2017 13:28:50 +0300 Subject: [PATCH 13/37] Unit tests for Ratio Scheduler based on unit tests for RR scheduler. --- tempesta_fw/t/unit/Makefile | 1 + tempesta_fw/t/unit/sched_helper.h | 1 + tempesta_fw/t/unit/test.c | 2 + tempesta_fw/t/unit/test_sched_ratio.c | 350 ++++++++++++++++++++++++++ 4 files changed, 354 insertions(+) create mode 100644 tempesta_fw/t/unit/test_sched_ratio.c diff --git a/tempesta_fw/t/unit/Makefile b/tempesta_fw/t/unit/Makefile index 7dee41b58..4fd70dd41 100644 --- a/tempesta_fw/t/unit/Makefile +++ b/tempesta_fw/t/unit/Makefile @@ -36,6 +36,7 @@ tfw_test-objs = \ test_tfw_str.o \ test_http_parser.o \ sched_helper.o \ + test_sched_ratio.o \ test_sched_rr.o \ test_sched_hash.o \ test_sched_http.o \ diff --git a/tempesta_fw/t/unit/sched_helper.h b/tempesta_fw/t/unit/sched_helper.h index 0888d183a..b8a3c3a6f 100644 --- a/tempesta_fw/t/unit/sched_helper.h +++ b/tempesta_fw/t/unit/sched_helper.h @@ -33,6 +33,7 @@ int tfw_server_init(void); int tfw_sched_rr_init(void); +int tfw_sched_ratio_init(void); void sched_helper_init(void); void test_spec_cleanup(TfwCfgSpec specs[]); diff --git a/tempesta_fw/t/unit/test.c b/tempesta_fw/t/unit/test.c index 3b01edfcc..c6a31cd7c 100644 --- a/tempesta_fw/t/unit/test.c +++ b/tempesta_fw/t/unit/test.c @@ -95,6 +95,7 @@ TEST_SUITE(http_sticky); TEST_SUITE(http_match); TEST_SUITE(hash); TEST_SUITE(addr); +TEST_SUITE(sched_ratio); TEST_SUITE(sched_rr); TEST_SUITE(sched_hash); TEST_SUITE(sched_http); @@ -120,6 +121,7 @@ test_run_all(void) TEST_SUITE_RUN(http_sticky); TEST_SUITE_RUN(hash); TEST_SUITE_RUN(addr); + TEST_SUITE_RUN(sched_ratio); TEST_SUITE_RUN(sched_rr); TEST_SUITE_RUN(sched_hash); TEST_SUITE_RUN(sched_http); diff --git a/tempesta_fw/t/unit/test_sched_ratio.c b/tempesta_fw/t/unit/test_sched_ratio.c new file mode 100644 index 000000000..14dfd5d84 --- /dev/null +++ b/tempesta_fw/t/unit/test_sched_ratio.c @@ -0,0 +1,350 @@ +/** + * Tempesta FW + * + * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). + * Copyright (C) 2015-2017 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include + +#undef tfw_sock_srv_init +#define tfw_sock_srv_init test_ratio_sock_srv_conn_init +#undef tfw_sock_srv_exit +#define tfw_sock_srv_exit test_ratio_sock_srv_exit +#undef tfw_srv_conn_release +#define tfw_srv_conn_release test_ratio_srv_conn_release +#undef tfw_sock_srv_cfg_mod +#define tfw_sock_srv_cfg_mod test_ratio_srv_cfg_mod + +#include "sock_srv.c" + +#ifdef module_init +#undef module_init +#undef module_exit +#define module_init(func) +#define module_exit(func) +#endif + +#include "../../sched/tfw_sched_ratio.c" + +#include "sched_helper.h" +#include "server.h" +#include "test.h" + +static TfwMsg * +sched_ratio_get_arg(size_t conn_type __attribute__((unused))) +{ + return NULL; +} + +static void +sched_ratio_free_arg(TfwMsg *msg __attribute__((unused))) +{ +} + +static struct TestSchedHelper sched_helper_ratio = { + .sched = "ratio", + .flags = TFW_SG_F_SCHED_RATIO_STATIC, + .conn_types = 1, + .get_sched_arg = &sched_ratio_get_arg, + .free_sched_arg = &sched_ratio_free_arg, +}; + +TEST(tfw_sched_ratio, sg_empty) +{ + test_sched_sg_empty_sg(&sched_helper_ratio); +} + +TEST(tfw_sched_ratio, sched_sg_one_srv_zero_conn) +{ + test_sched_sg_one_srv_zero_conn(&sched_helper_ratio); +} + +TEST(tfw_sched_ratio, sched_sg_one_srv_max_conn) +{ + size_t i, j; + long long conn_acc = 0, conn_acc_check = 0; + + TfwSrvGroup *sg = test_create_sg("test"); + TfwServer *srv = test_create_srv("127.0.0.1", sg); + TfwSrvConn *srv_conn; + + for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) { + srv_conn = test_create_srv_conn(srv); + conn_acc ^= (long long)srv_conn; + } + + sg->flags = sched_helper_ratio.flags; + test_start_sg(sg, sched_helper_ratio.sched); + + /* + * Check that connections are scheduled in fair way: + * every connection will be scheduled only once + */ + for (i = 0; i < sched_helper_ratio.conn_types; ++i) { + TfwMsg *msg = sched_helper_ratio.get_sched_arg(i); + conn_acc_check = 0; + + for (j = 0; j < srv->conn_n; ++j) { + srv_conn = sg->sched->sched_sg_conn(msg, sg); + EXPECT_NOT_NULL(srv_conn); + if (!srv_conn) + goto err; + + conn_acc_check ^= (long long)srv_conn; + tfw_srv_conn_put(srv_conn); + /* + * Don't let the kernel watchdog decide + * that we are stuck in locked context. 
+			 */
+			kernel_fpu_end();
+			schedule();
+			kernel_fpu_begin();
+		}
+
+		EXPECT_EQ(conn_acc, conn_acc_check);
+		sched_helper_ratio.free_sched_arg(msg);
+	}
+err:
+	test_conn_release_all(sg);
+	test_sg_release_all();
+}
+
+TEST(tfw_sched_ratio, sched_sg_max_srv_zero_conn)
+{
+	test_sched_sg_max_srv_zero_conn(&sched_helper_ratio);
+}
+
+TEST(tfw_sched_ratio, sched_sg_max_srv_max_conn)
+{
+	unsigned long i, j;
+	long long conn_acc = 0, conn_acc_check = 0;
+
+	TfwSrvGroup *sg = test_create_sg("test");
+	TfwServer *srv;
+	TfwSrvConn *srv_conn;
+
+	for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) {
+		srv = test_create_srv("127.0.0.1", sg);
+
+		for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) {
+			srv_conn = test_create_srv_conn(srv);
+			conn_acc ^= (long long)srv_conn;
+		}
+	}
+
+	sg->flags = sched_helper_ratio.flags;
+	test_start_sg(sg, sched_helper_ratio.sched);
+
+	/*
+	 * Check that connections are scheduled in fair way:
+	 * every connection will be scheduled only once
+	 */
+	for (i = 0; i < sched_helper_ratio.conn_types; ++i) {
+		TfwMsg *msg = sched_helper_ratio.get_sched_arg(i);
+		conn_acc_check = 0;
+
+		for (j = 0; j < TFW_TEST_SG_MAX_CONN_N; ++j) {
+			srv_conn = sg->sched->sched_sg_conn(msg, sg);
+			EXPECT_NOT_NULL(srv_conn);
+			if (!srv_conn)
+				goto err;
+
+			conn_acc_check ^= (long long)srv_conn;
+			tfw_srv_conn_put(srv_conn);
+		}
+
+		EXPECT_EQ(conn_acc, conn_acc_check);
+		sched_helper_ratio.free_sched_arg(msg);
+	}
+err:
+	test_conn_release_all(sg);
+	test_sg_release_all();
+}
+
+TEST(tfw_sched_ratio, sched_srv_one_srv_zero_conn)
+{
+	test_sched_srv_one_srv_zero_conn(&sched_helper_ratio);
+}
+
+TEST(tfw_sched_ratio, sched_srv_one_srv_max_conn)
+{
+	size_t i, j;
+	long long conn_acc = 0, conn_acc_check = 0;
+
+	TfwSrvGroup *sg = test_create_sg("test");
+	TfwServer *srv = test_create_srv("127.0.0.1", sg);
+	TfwSrvConn *srv_conn;
+
+	for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) {
+		srv_conn = test_create_srv_conn(srv);
+		conn_acc ^= (long long)srv_conn;
+	}
+
+	sg->flags = sched_helper_ratio.flags;
+	test_start_sg(sg, sched_helper_ratio.sched);
+
+	/*
+	 * Check that connections are scheduled in fair way:
+	 * every connection will be scheduled only once
+	 */
+	for (i = 0; i < sched_helper_ratio.conn_types; ++i) {
+		TfwMsg *msg = sched_helper_ratio.get_sched_arg(i);
+		conn_acc_check = 0;
+
+		for (j = 0; j < srv->conn_n; ++j) {
+			srv_conn = sg->sched->sched_srv_conn(msg, srv);
+			EXPECT_NOT_NULL(srv_conn);
+			if (!srv_conn)
+				goto err;
+			EXPECT_EQ((TfwServer *)srv_conn->peer, srv);
+
+			conn_acc_check ^= (long long)srv_conn;
+			tfw_srv_conn_put(srv_conn);
+
+			/*
+			 * Don't let the kernel watchdog decide
+			 * that we are stuck in locked context.
+ */ + kernel_fpu_end(); + schedule(); + kernel_fpu_begin(); + } + + EXPECT_EQ(conn_acc, conn_acc_check); + sched_helper_ratio.free_sched_arg(msg); + } +err: + test_conn_release_all(sg); + test_sg_release_all(); +} + +TEST(tfw_sched_ratio, sched_srv_max_srv_zero_conn) +{ + test_sched_srv_max_srv_zero_conn(&sched_helper_ratio); +} + +TEST(tfw_sched_ratio, sched_srv_max_srv_max_conn) +{ + size_t i, j; + long long conn_acc_check = 0; + struct { + TfwServer *srv; + long long conn_acc; + } srv_acc[TFW_TEST_SG_MAX_SRV_N] = { 0 }; + TfwServer *srv; + TfwSrvConn *srv_conn; + + TfwSrvGroup *sg = test_create_sg("test"); + + for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { + srv = test_create_srv("127.0.0.1", sg); + srv_acc[i].srv = srv; + + for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) { + srv_conn = test_create_srv_conn(srv); + srv_acc[i].conn_acc ^= (long long)srv_conn; + } + } + + sg->flags = sched_helper_ratio.flags; + test_start_sg(sg, sched_helper_ratio.sched); + + /* + * Check that connections are scheduled in fair way: + * every connection will be scheduled only once + */ + for (i = 0; i < sched_helper_ratio.conn_types; ++i) { + TfwMsg *msg = sched_helper_ratio.get_sched_arg(i); + + list_for_each_entry(srv, &sg->srv_list, list) { + size_t k = 0; + conn_acc_check = 0; + + for (j = 0; j < srv->conn_n; ++j) { + srv_conn = sg->sched->sched_srv_conn(msg, srv); + EXPECT_NOT_NULL(srv_conn); + if (!srv_conn) + goto err; + EXPECT_EQ((TfwServer *)srv_conn->peer, srv); + + conn_acc_check ^= (long long)srv_conn; + tfw_srv_conn_put(srv_conn); + + /* + * Don't let the kernel watchdog decide + * that we are stuck in locked context. + */ + kernel_fpu_end(); + schedule(); + kernel_fpu_begin(); + } + + for (k = 0; k < srv->conn_n; ++k) { + if (srv_acc[k].srv == srv) + EXPECT_EQ(srv_acc[k].conn_acc, + conn_acc_check); + } + } + sched_helper_ratio.free_sched_arg(msg); + } +err: + test_conn_release_all(sg); + test_sg_release_all(); +} + +TEST(tfw_sched_ratio, sched_srv_offline_srv) +{ + test_sched_srv_offline_srv(&sched_helper_ratio); +} + +TEST_SUITE(sched_ratio) +{ + kernel_fpu_end(); + + tfw_server_init(); + tfw_sched_ratio_init(); + + kernel_fpu_begin(); + + /* + * Schedulers have the same interface so some test cases can use generic + * implementations. Some test cases still have to know how scheduler + * work at low level. Please, keep same structure for implementation + * aware test cases across all schedulers. + * + * Implementation aware cases: + * sched_sg_one_srv_max_conn + * sched_sg_max_srv_max_conn + * sched_srv_one_srv_max_conn + * sched_srv_max_srv_max_conn + */ + + TEST_RUN(tfw_sched_ratio, sg_empty); + + TEST_RUN(tfw_sched_ratio, sched_sg_one_srv_zero_conn); + TEST_RUN(tfw_sched_ratio, sched_sg_one_srv_max_conn); + TEST_RUN(tfw_sched_ratio, sched_sg_max_srv_zero_conn); + TEST_RUN(tfw_sched_ratio, sched_sg_max_srv_max_conn); + + TEST_RUN(tfw_sched_ratio, sched_srv_one_srv_zero_conn); + TEST_RUN(tfw_sched_ratio, sched_srv_one_srv_max_conn); + TEST_RUN(tfw_sched_ratio, sched_srv_max_srv_zero_conn); + TEST_RUN(tfw_sched_ratio, sched_srv_max_srv_max_conn); + TEST_RUN(tfw_sched_ratio, sched_srv_offline_srv); +} From 5640772a16cc914d1a75040e2ed14fb5757020a2 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Tue, 4 Apr 2017 16:17:35 +0300 Subject: [PATCH 14/37] Update the docs with new options related to ratio scheduler. 
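
A quick illustration of the syntax documented below (all values are
examples only):

    server 192.168.1.1:8080 conns_n=16 weight=75;
    sched ratio static;
    sched dynamic percentile 75;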
---
 README.md | 68 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 56 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 5be0c0b1a..77bf5113e 100644
--- a/README.md
+++ b/README.md
@@ -388,13 +388,17 @@ location prefix "/society/" {
 A back end HTTP server is defined with `server` directive. The full
 syntax is as follows:
 ```
-server <IPADDR>[:<PORT>] [conns_n=<N>];
+server <IPADDR>[:<PORT>] [conns_n=<N>] [weight=<NN>];
 ```
-`IPADDR` can be either IPv4 or IPv6 address. Hostnames are not allowed.
+* `IPADDR` can be either an IPv4 or IPv6 address. Hostnames are not allowed.
 IPv6 address must be enclosed in square brackets (e.g. "[::0]" but not "::0").
-`PORT` defaults to 80 if not specified.
-`conns_n=<N>` is the number of parallel connections to the server.
+* `PORT` defaults to 80 if not specified.
+* `conns_n=<N>` is the number of parallel connections to the server.
 `N` defaults to 32 if not specified.
+* `weight=<NN>` is the static weight of the server. The weight must be
+in the range of 1 to 100. If not specified, then the default weight of 50
+is used with the static ratio scheduler. For convenience, a weight needs
+to be specified only when it differs from the default value.
 
 Multiple back end servers may be defined. For example:
 ```
@@ -469,7 +473,8 @@ If a server goes offline, then other servers in a group take the load.
 The full syntax is as follows:
 ```
 srv_group <NAME> {
-	server <IPADDR>[:<PORT>] [conns_n=<N>];
+	sched <SCHED_NAME>;
+	server <IPADDR>[:<PORT>] [conns_n=<N>] [weight=<NN>];
 	...
 }
 ```
@@ -502,14 +507,16 @@ Scheduler is used to distribute load among servers within a group.
 The group can be either explicit, defined with `srv_group` directive,
 or implicit. The syntax is as follows:
 ```
-sched <SCHED_NAME>;
+sched <SCHED_NAME> [OPTIONS];
 ```
 `SCHED_NAME` is the name of a scheduler available in Tempesta.
+`OPTIONS` are optional. Not all schedulers have additional options.
 
 Currently there are two schedulers available:
-* **round-robin** - Rotates all servers in a group in round-robin manner so
-that requests are distributed uniformly across servers. This is the default
-scheduler.
+* **ratio** - Balances the load across servers in a group based on each
+server's weight. Requests are forwarded more to servers with more weight,
+and less to servers with less weight. As a result, each server in a group
+receives an optimal load. This is the default scheduler.
 * **hash** - Chooses a server based on a URI/Host hash of a request.
 Requests are distributed uniformly, and requests with the same URI/Host
 are always sent to the same server.
@@ -527,9 +534,46 @@ A scheduler defined for the implicit group becomes the scheduler for an
 explicit group defined with `srv_group` directive if the explicit group
 is missing the `sched` directive.
 
-If no scheduler is defined for a group, then scheduler defaults
-to `round-robin`.
-
+If no scheduler is defined, then scheduler defaults to `ratio`.
+
+**ratio** scheduler may have the following options:
+* **static** - The weight of each server in a group is defined statically
+with the `[weight=<NN>]` option of the `server` directive. This is the
+default `ratio` scheduler option.
+* **dynamic** - The weight of each server in a group is defined dynamically.
+Specific type of dynamic weight is specified with additional options:
  * **minimum** - The current minimum response time from a server;
  * **maximum** - The current maximum response time from a server;
  * **average** - The current average response time from a server;
  * **percentile `[<NN>]`** - The current response time from a server that
  is within the specified percentile. The percentile may be one of 50, 75,
  90, 95, 99. If none is given, then the default percentile of 90 is used.
+If a specific type of dynamic weight is not specified, then the default type
+of `average` is used.
+
+Naturally, if a dynamic scheduler is specified for a group, and there's
+a server in that group with the `weight` option, then an error is produced
+as that combination is incompatible.
+
+The following are examples of scheduler specification in configuration.
+Again, only one `sched` directive is allowed per group.
+```
+# Use hash scheduler
+sched hash;
+# Use ratio scheduler. By default, static weight distribution is used.
+sched ratio;
+# Use ratio scheduler with static weight distribution.
+sched ratio static;
+# Use dynamic scheduler. By default, current average response time is used
+# for weight distribution.
+sched dynamic;
+# Use dynamic scheduler with maximum response time for weight distribution.
+sched dynamic maximum;
+# Use dynamic scheduler, default percentile of 90 is used.
+sched dynamic percentile;
+# Use dynamic scheduler, percentile of 75 is used for weight distribution.
+sched dynamic percentile 75;
+```

 #### HTTP Scheduler

From c55bf5f62826400c948617b29c33317b47054afc Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Tue, 4 Apr 2017 16:50:29 +0300
Subject: [PATCH 15/37] Better description of ratio scheduler behavior and the
 weight option.

---
 README.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 77bf5113e..2d6dbc53f 100644
--- a/README.md
+++ b/README.md
@@ -516,7 +516,9 @@ Currently there are two schedulers available:
 * **ratio** - Balances the load across servers in a group based on each
 server's weight. Requests are forwarded more to servers with more weight,
 and less to servers with less weight. As a result, each server in a group
-receives an optimal load. This is the default scheduler.
+receives an optimal load. In the default configuration where weights are
+not specified, server weights are considered equal, and the scheduler
+works in pure round-robin fashion. This is the default scheduler.
 * **hash** - Chooses a server based on a URI/Host hash of a request.
 Requests are distributed uniformly, and requests with the same URI/Host
 are always sent to the same server.
@@ -575,6 +577,16 @@ sched dynamic percentile;
 sched dynamic percentile 75;
 ```
 
+Servers should be grouped together with proper care. Server groups should
+be created with servers that handle similar resources. For instance, if
+servers with static content that is served quickly are grouped together
+with servers with dynamic content that is I/O bound, then the quick
+response times from servers with static content will be nearly invisible
+in comparison to longer response times from servers with dynamic content.
+In that case the distribution of load among these servers will be severely
+skewed.
+
+
 #### HTTP Scheduler
 
 HTTP scheduler plays a special role as it distributes HTTP requests among

From bab5e182a4d4192a308e1510600832d5bdceb905 Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Tue, 4 Apr 2017 17:19:40 +0300
Subject: [PATCH 16/37] Remove RR scheduler. It's now replaced by Ratio
 scheduler.
--- tempesta_fw/sched/Makefile | 3 +- tempesta_fw/sched/tfw_sched_rr.c | 264 -------------------- tempesta_fw/sock_srv.c | 4 +- tempesta_fw/t/unit/Makefile | 1 - tempesta_fw/t/unit/sched_helper.h | 1 - tempesta_fw/t/unit/test.c | 2 - tempesta_fw/t/unit/test_sched_http.c | 30 +-- tempesta_fw/t/unit/test_sched_rr.c | 345 --------------------------- 8 files changed, 18 insertions(+), 632 deletions(-) delete mode 100644 tempesta_fw/sched/tfw_sched_rr.c delete mode 100644 tempesta_fw/t/unit/test_sched_rr.c diff --git a/tempesta_fw/sched/Makefile b/tempesta_fw/sched/Makefile index 660bda481..bafb70706 100644 --- a/tempesta_fw/sched/Makefile +++ b/tempesta_fw/sched/Makefile @@ -20,5 +20,4 @@ EXTRA_CFLAGS += $(TFW_CFLAGS) -I$(src)/../ -I$(src)/../../tempesta_db/core EXTRA_CFLAGS += $(TTLS_CFLAGS) -obj-m = tfw_sched_hash.o tfw_sched_http.o tfw_sched_ratio.o tfw_sched_rr.o -### obj-m = tfw_sched_hash.o tfw_sched_http.o tfw_sched_rr.o +obj-m = tfw_sched_hash.o tfw_sched_http.o tfw_sched_ratio.o diff --git a/tempesta_fw/sched/tfw_sched_rr.c b/tempesta_fw/sched/tfw_sched_rr.c deleted file mode 100644 index fa0f618b8..000000000 --- a/tempesta_fw/sched/tfw_sched_rr.c +++ /dev/null @@ -1,264 +0,0 @@ -/** - * Tempesta FW - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2017 Tempesta Technologies, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include -#include - -#include "tempesta_fw.h" -#include "log.h" -#include "server.h" - -MODULE_AUTHOR(TFW_AUTHOR); -MODULE_DESCRIPTION("Tempesta round-robin scheduler"); -MODULE_VERSION("0.3.0"); -MODULE_LICENSE("GPL"); - -/** - * List of connections to an upstream server. - * Connections can go up and down during failover process. Only - * fully established connections are considered by the scheduler. - */ -typedef struct { - atomic64_t rr_counter; - size_t conn_n; - TfwServer *srv; - TfwSrvConn **conns; -} TfwRrSrv; - -/** - * List of upstream servers. - * The list is considered static, i.e. all servers, either dead - * or alive, are present in the list during the whole run-time. - * That may change in the future. 
- */ -typedef struct { - atomic64_t rr_counter; - size_t srv_n; - TfwRrSrv *srvs; -} TfwRrSrvList; - -static inline TfwSrvConn * -__sched_srv(TfwRrSrv *srv_cl, int skipnip, int *nipconn) -{ - size_t c; - - for (c = 0; c < srv_cl->conn_n; ++c) { - unsigned long idxval = atomic64_inc_return(&srv_cl->rr_counter); - TfwSrvConn *srv_conn = srv_cl->conns[idxval % srv_cl->conn_n]; - - if (unlikely(tfw_srv_conn_restricted(srv_conn) - || tfw_srv_conn_queue_full(srv_conn))) - continue; - if (skipnip && tfw_srv_conn_hasnip(srv_conn)) { - if (likely(tfw_srv_conn_live(srv_conn))) - ++(*nipconn); - continue; - } - if (likely(tfw_srv_conn_get_if_live(srv_conn))) - return srv_conn; - } - - return NULL; -} - -/** - * On each subsequent call the function returns the next server in the - * group. Parallel connections to the same server are also rotated in - * the round-robin manner. - * - * Dead connections and servers w/o live connections are skipped. - * Initially, connections with non-idempotent requests are also skipped - * in attempt to increase throughput. However, if all live connections - * contain a non-idempotent request, then re-run the algorithm and get - * the first live connection they way it is usually done. - * - * RR scheduler must be the fastest scheduler. Also, it's essential - * to maintain strict round-robin fashion of getting the next server. - * Usually the optimistic approach gives the fastest solution: we are - * optimistic in that there are not many non-idempotent requests, and - * there are available server connections. - */ -static TfwSrvConn * -tfw_sched_rr_get_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) -{ - size_t s; - int skipnip = 1, nipconn = 0; - TfwRrSrvList *sl = sg->sched_data; - - BUG_ON(!sl); -rerun: - for (s = 0; s < sl->srv_n; ++s) { - unsigned long idxval = atomic64_inc_return(&sl->rr_counter); - TfwRrSrv *srv_cl = &sl->srvs[idxval % sl->srv_n]; - TfwSrvConn *srv_conn; - - if ((srv_conn = __sched_srv(srv_cl, skipnip, &nipconn))) - return srv_conn; - } - if (skipnip && nipconn) { - skipnip = 0; - goto rerun; - } - - return NULL; -} - -/** - * Same as @tfw_sched_rr_get_sg_conn(), but but schedule for a specific server - * in a group. - */ -static TfwSrvConn * -tfw_sched_rr_get_srv_conn(TfwMsg *msg, TfwServer *srv) -{ - int skipnip = 1, nipconn = 0; - TfwRrSrv *srv_cl = srv->sched_data; - TfwSrvConn *srv_conn; - - /* - * For @srv without connections srv_cl will be NULL, that normally - * does not happen in real life, but unit tests check that case. - */ - if (unlikely(!srv_cl)) - return NULL; - -rerun: - if ((srv_conn = __sched_srv(srv_cl, skipnip, &nipconn))) - return srv_conn; - - if (skipnip && nipconn) { - skipnip = 0; - goto rerun; - } - - return NULL; -} - -static void -tfw_sched_rr_cleanup(TfwSrvGroup *sg) -{ - size_t si; - TfwRrSrvList *sl = sg->sched_data; - - if (!sl) - return; - - for (si = 0; si < sg->srv_n; ++si) - if (sl->srvs[si].conns) - kfree(sl->srvs[si].conns); - - kfree(sl); - sg->sched_data = NULL; -} - -static void -tfw_sched_rr_del_grp(TfwSrvGroup *sg) -{ - tfw_sched_rr_cleanup(sg); -} - -static int -tfw_sched_rr_add_grp(TfwSrvGroup *sg) -{ - int ret = -ENOMEM; - size_t size, si, ci; - TfwServer *srv; - TfwSrvConn *srv_conn; - TfwRrSrv *rrsrv; - TfwRrSrvList *sl; - - /* - * Validate the number of servers in the group, and the number - * of connections for each server. 
- */ - si = 0; - list_for_each_entry(srv, &sg->srv_list, list) { - ci = 0; - list_for_each_entry(srv_conn, &srv->conn_list, list) - ++ci; - if (ci > srv->conn_n) - return -EINVAL; - ++si; - } - if (si > sg->srv_n) - return -EINVAL; - - size = sizeof(TfwRrSrvList) + sizeof(TfwRrSrv) * sg->srv_n; - if (!(sg->sched_data = kzalloc(size, GFP_KERNEL))) - return -ENOMEM; - sl = sg->sched_data; - sl->srvs = sg->sched_data + sizeof(TfwRrSrvList); - sl->srv_n = sg->srv_n; - - rrsrv = sl->srvs; - list_for_each_entry(srv, &sg->srv_list, list) { - size = sizeof(rrsrv->conns[0]) * srv->conn_n; - if (!(rrsrv->conns = kzalloc(size, GFP_KERNEL))) - goto cleanup; - ci = 0; - list_for_each_entry(srv_conn, &srv->conn_list, list) - rrsrv->conns[ci++] = srv_conn; - rrsrv->conn_n = srv->conn_n; - rrsrv->srv = srv; - srv->sched_data = rrsrv; - ++rrsrv; - } - - return 0; - -cleanup: - tfw_sched_rr_cleanup(sg); - return ret; -} - -/** - * Add a connection and a server, if new, to the scheduler. - * Called at configuration stage, no synchronization is required. - */ -static void -tfw_sched_rr_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) -{ -} - -static TfwScheduler tfw_sched_rr = { - .name = "round-robin", - .list = LIST_HEAD_INIT(tfw_sched_rr.list), - .add_grp = tfw_sched_rr_add_grp, - .del_grp = tfw_sched_rr_del_grp, - .add_conn = tfw_sched_rr_add_conn, - .sched_sg_conn = tfw_sched_rr_get_sg_conn, - .sched_srv_conn = tfw_sched_rr_get_srv_conn, -}; - -int -tfw_sched_rr_init(void) -{ - TFW_DBG("sched_rr: init\n"); - return tfw_sched_register(&tfw_sched_rr); -} -module_init(tfw_sched_rr_init); - -void -tfw_sched_rr_exit(void) -{ - TFW_DBG("sched_rr: exit\n"); - tfw_sched_unregister(&tfw_sched_rr); -} -module_exit(tfw_sched_rr_exit); - diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c index 17c278610..6e6ff4898 100644 --- a/tempesta_fw/sock_srv.c +++ b/tempesta_fw/sock_srv.c @@ -1243,7 +1243,7 @@ static TfwCfgSpec tfw_srv_group_specs[] = { .cleanup = tfw_clean_srv_groups }, { - "sched", "round-robin", + "sched", "ratio", tfw_cfgop_in_sched, .allow_none = true, .allow_repeat = false, @@ -1307,7 +1307,7 @@ TfwCfgMod tfw_sock_srv_cfg_mod = { .cleanup = tfw_clean_srv_groups, }, { - "sched", "round-robin", + "sched", "ratio", tfw_cfgop_out_sched, .allow_none = true, .allow_repeat = false, diff --git a/tempesta_fw/t/unit/Makefile b/tempesta_fw/t/unit/Makefile index 4fd70dd41..57c6ff6d3 100644 --- a/tempesta_fw/t/unit/Makefile +++ b/tempesta_fw/t/unit/Makefile @@ -37,7 +37,6 @@ tfw_test-objs = \ test_http_parser.o \ sched_helper.o \ test_sched_ratio.o \ - test_sched_rr.o \ test_sched_hash.o \ test_sched_http.o \ test_http_sticky.o \ diff --git a/tempesta_fw/t/unit/sched_helper.h b/tempesta_fw/t/unit/sched_helper.h index b8a3c3a6f..f39d8ae82 100644 --- a/tempesta_fw/t/unit/sched_helper.h +++ b/tempesta_fw/t/unit/sched_helper.h @@ -32,7 +32,6 @@ (TFW_TEST_SG_MAX_SRV_N * TFW_TEST_SRV_MAX_CONN_N) int tfw_server_init(void); -int tfw_sched_rr_init(void); int tfw_sched_ratio_init(void); void sched_helper_init(void); diff --git a/tempesta_fw/t/unit/test.c b/tempesta_fw/t/unit/test.c index c6a31cd7c..8829c1c99 100644 --- a/tempesta_fw/t/unit/test.c +++ b/tempesta_fw/t/unit/test.c @@ -96,7 +96,6 @@ TEST_SUITE(http_match); TEST_SUITE(hash); TEST_SUITE(addr); TEST_SUITE(sched_ratio); -TEST_SUITE(sched_rr); TEST_SUITE(sched_hash); TEST_SUITE(sched_http); @@ -122,7 +121,6 @@ test_run_all(void) TEST_SUITE_RUN(hash); TEST_SUITE_RUN(addr); TEST_SUITE_RUN(sched_ratio); - TEST_SUITE_RUN(sched_rr); 
TEST_SUITE_RUN(sched_hash); TEST_SUITE_RUN(sched_http); diff --git a/tempesta_fw/t/unit/test_sched_http.c b/tempesta_fw/t/unit/test_sched_http.c index 1ec422ba2..6d0fa8538 100644 --- a/tempesta_fw/t/unit/test_sched_http.c +++ b/tempesta_fw/t/unit/test_sched_http.c @@ -112,7 +112,7 @@ TEST(tfw_sched_http, one_rule_and_zero_conns) { TfwSrvGroup *sg = test_create_sg("default"); sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg, "round-robin"); + test_start_sg(sg, "ratio"); if (parse_cfg("sched_http_rules {\nmatch default * * *;\n}\n")) { TEST_FAIL("can't parse rules\n"); @@ -134,7 +134,7 @@ TEST(tfw_sched_http, one_wildcard_rule) srv = test_create_srv("127.0.0.1", sg); expect_conn = test_create_srv_conn(srv); sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg, "round-robin"); + test_start_sg(sg, "ratio"); if (parse_cfg("sched_http_rules {\nmatch default * * *;\n}\n")) { TEST_FAIL("can't parse rules\n"); @@ -160,61 +160,61 @@ TEST(tfw_sched_http, some_rules) srv = test_create_srv("127.0.0.1", sg1); expect_conn1 = test_create_srv_conn(srv); sg1->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg1, "round-robin"); + test_start_sg(sg1, "ratio"); sg2 = test_create_sg("sg2"); srv = test_create_srv("127.0.0.1", sg2); expect_conn2 = test_create_srv_conn(srv); sg2->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg2, "round-robin"); + test_start_sg(sg2, "ratio"); sg3 = test_create_sg("sg3"); srv = test_create_srv("127.0.0.1", sg3); expect_conn3 = test_create_srv_conn(srv); sg3->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg3, "round-robin"); + test_start_sg(sg3, "ratio"); sg4 = test_create_sg("sg4"); srv = test_create_srv("127.0.0.1", sg4); expect_conn4 = test_create_srv_conn(srv); sg4->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg4, "round-robin"); + test_start_sg(sg4, "ratio"); sg5 = test_create_sg("sg5"); srv = test_create_srv("127.0.0.1", sg5); expect_conn5 = test_create_srv_conn(srv); sg5->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg5, "round-robin"); + test_start_sg(sg5, "ratio"); sg6 = test_create_sg("sg6"); srv = test_create_srv("127.0.0.1", sg6); expect_conn6 = test_create_srv_conn(srv); sg6->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg6, "round-robin"); + test_start_sg(sg6, "ratio"); sg7 = test_create_sg("sg7"); srv = test_create_srv("127.0.0.1", sg7); expect_conn7 = test_create_srv_conn(srv); sg7->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg7, "round-robin"); + test_start_sg(sg7, "ratio"); sg8 = test_create_sg("sg8"); srv = test_create_srv("127.0.0.1", sg8); expect_conn8 = test_create_srv_conn(srv); sg8->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg8, "round-robin"); + test_start_sg(sg8, "ratio"); sg9 = test_create_sg("sg9"); srv = test_create_srv("127.0.0.1", sg9); expect_conn9 = test_create_srv_conn(srv); sg9->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg9, "round-robin"); + test_start_sg(sg9, "ratio"); sg10 = test_create_sg("sg10"); srv = test_create_srv("127.0.0.1", sg10); expect_conn10 = test_create_srv_conn(srv); sg10->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg10, "round-robin"); + test_start_sg(sg10, "ratio"); if (parse_cfg("sched_http_rules {\nmatch sg1 uri eq /foo;\n\ match sg2 uri prefix /foo/bar;\n\ @@ -330,7 +330,7 @@ TEST(tfw_sched_http, one_rule) srv = test_create_srv("127.0.0.1", sg); expect_conn = test_create_srv_conn(srv); sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg, "round-robin"); + test_start_sg(sg, "ratio"); if 
(parse_cfg(test_cases[i].rule_str)) { TEST_FAIL("can't parse rules\n"); @@ -351,9 +351,9 @@ TEST_SUITE(sched_http) kernel_fpu_end(); - s = tfw_sched_lookup("round-robin"); + s = tfw_sched_lookup("ratio"); if (!s) - tfw_sched_rr_init(); + tfw_sched_ratio_init(); tfw_sched_http_init(); tfw_server_init(); diff --git a/tempesta_fw/t/unit/test_sched_rr.c b/tempesta_fw/t/unit/test_sched_rr.c deleted file mode 100644 index 683754d0f..000000000 --- a/tempesta_fw/t/unit/test_sched_rr.c +++ /dev/null @@ -1,345 +0,0 @@ -/** - * Tempesta FW - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2017 Tempesta Technologies, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include - -#undef tfw_sock_srv_init -#define tfw_sock_srv_init test_rr_sock_srv_conn_init -#undef tfw_sock_srv_exit -#define tfw_sock_srv_exit test_rr_sock_srv_exit -#undef tfw_srv_conn_release -#define tfw_srv_conn_release test_rr_srv_conn_release -#undef tfw_sock_srv_cfg_mod -#define tfw_sock_srv_cfg_mod test_rr_srv_cfg_mod - -#include "sock_srv.c" - -#ifdef module_init -#undef module_init -#undef module_exit -#define module_init(func) -#define module_exit(func) -#endif - -#include "../../sched/tfw_sched_rr.c" - -#include "sched_helper.h" -#include "server.h" -#include "test.h" - -static TfwMsg * -sched_rr_get_arg(size_t conn_type __attribute__((unused))) -{ - return NULL; -} - -static void -sched_rr_free_arg(TfwMsg *msg __attribute__((unused))) -{ -} - -static struct TestSchedHelper sched_helper_rr = { - .sched = "round-robin", - .conn_types = 1, - .get_sched_arg = &sched_rr_get_arg, - .free_sched_arg = &sched_rr_free_arg, -}; - -TEST(tfw_sched_rr, sg_empty) -{ - test_sched_sg_empty_sg(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_sg_one_srv_zero_conn) -{ - test_sched_sg_one_srv_zero_conn(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_sg_one_srv_max_conn) -{ - size_t i, j; - long long conn_acc = 0, conn_acc_check = 0; - - TfwSrvGroup *sg = test_create_sg("test"); - TfwServer *srv = test_create_srv("127.0.0.1", sg); - TfwSrvConn *srv_conn; - - for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) { - srv_conn = test_create_srv_conn(srv); - conn_acc ^= (long long)srv_conn; - } - - test_start_sg(sg, sched_helper_rr.sched); - - /* - * Check that connections are scheduled in fair way: - * every connection will be scheduled only once - */ - for (i = 0; i < sched_helper_rr.conn_types; ++i) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); - conn_acc_check = 0; - - for (j = 0; j < srv->conn_n; ++j) { - srv_conn = sg->sched->sched_sg_conn(msg, sg); - EXPECT_NOT_NULL(srv_conn); - if (!srv_conn) - goto err; - - conn_acc_check ^= (long long)srv_conn; - tfw_srv_conn_put(srv_conn); - /* - * Don't let the kernel watchdog decide - * that we are stuck in locked context. 
- */ - kernel_fpu_end(); - schedule(); - kernel_fpu_begin(); - } - - sched_helper_rr.free_sched_arg(msg); - EXPECT_EQ(conn_acc, conn_acc_check); - sched_helper_rr.free_sched_arg(msg); - } -err: - test_conn_release_all(sg); - test_sg_release_all(); -} - -TEST(tfw_sched_rr, sched_sg_max_srv_zero_conn) -{ - test_sched_sg_max_srv_zero_conn(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_sg_max_srv_max_conn) -{ - unsigned long i, j; - long long conn_acc = 0, conn_acc_check = 0; - - TfwSrvGroup *sg = test_create_sg("test"); - TfwServer *srv; - TfwSrvConn *srv_conn; - - for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { - srv = test_create_srv("127.0.0.1", sg); - - for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) { - srv_conn = test_create_srv_conn(srv); - conn_acc ^= (long long)srv_conn; - } - } - - test_start_sg(sg, sched_helper_rr.sched); - - /* - * Check that connections are scheduled in fair way: - * every connection will be scheduled only once - */ - for (i = 0; i < sched_helper_rr.conn_types; ++i) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); - conn_acc_check = 0; - - for (j = 0; j < TFW_TEST_SG_MAX_CONN_N; ++j) { - srv_conn = sg->sched->sched_sg_conn(msg, sg); - EXPECT_NOT_NULL(srv_conn); - if (!srv_conn) - goto err; - - conn_acc_check ^= (long long)srv_conn; - tfw_srv_conn_put(srv_conn); - } - - sched_helper_rr.free_sched_arg(msg); - EXPECT_EQ(conn_acc, conn_acc_check); - sched_helper_rr.free_sched_arg(msg); - } -err: - test_conn_release_all(sg); - test_sg_release_all(); -} - -TEST(tfw_sched_rr, sched_srv_one_srv_zero_conn) -{ - test_sched_srv_one_srv_zero_conn(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_srv_one_srv_max_conn) -{ - size_t i, j; - long long conn_acc = 0, conn_acc_check = 0; - - TfwSrvGroup *sg = test_create_sg("test"); - TfwServer *srv = test_create_srv("127.0.0.1", sg); - TfwSrvConn *srv_conn; - - for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) { - srv_conn = test_create_srv_conn(srv); - conn_acc ^= (long long)srv_conn; - } - - test_start_sg(sg, sched_helper_rr.sched); - - /* - * Check that connections are scheduled in fair way: - * every connection will be scheduled only once - */ - for (i = 0; i < sched_helper_rr.conn_types; ++i) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); - conn_acc_check = 0; - - for (j = 0; j < srv->conn_n; ++j) { - srv_conn = sg->sched->sched_srv_conn(msg, srv); - EXPECT_NOT_NULL(srv_conn); - if (!srv_conn) - goto err; - EXPECT_EQ((TfwServer *)srv_conn->peer, srv); - - conn_acc_check ^= (long long)srv_conn; - tfw_srv_conn_put(srv_conn); - - /* - * Don't let the kernel watchdog decide - * that we are stuck in locked context. 
- */ - kernel_fpu_end(); - schedule(); - kernel_fpu_begin(); - } - - EXPECT_EQ(conn_acc, conn_acc_check); - sched_helper_rr.free_sched_arg(msg); - } -err: - test_conn_release_all(sg); - test_sg_release_all(); -} - -TEST(tfw_sched_rr, sched_srv_max_srv_zero_conn) -{ - test_sched_srv_max_srv_zero_conn(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_srv_max_srv_max_conn) -{ - size_t i, j; - long long conn_acc_check = 0; - struct { - TfwServer *srv; - long long conn_acc; - } srv_acc[TFW_TEST_SG_MAX_SRV_N] = { 0 }; - TfwServer *srv; - TfwSrvConn *srv_conn; - - TfwSrvGroup *sg = test_create_sg("test"); - - for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { - srv = test_create_srv("127.0.0.1", sg); - srv_acc[i].srv = srv; - - for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) { - srv_conn = test_create_srv_conn(srv); - srv_acc[i].conn_acc ^= (long long)srv_conn; - } - } - - test_start_sg(sg, sched_helper_rr.sched); - - /* - * Check that connections are scheduled in fair way: - * every connection will be scheduled only once - */ - for (i = 0; i < sched_helper_rr.conn_types; ++i) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); - - list_for_each_entry(srv, &sg->srv_list, list) { - size_t k = 0; - conn_acc_check = 0; - - for (j = 0; j < srv->conn_n; ++j) { - srv_conn = sg->sched->sched_srv_conn(msg, srv); - EXPECT_NOT_NULL(srv_conn); - if (!srv_conn) - goto err; - EXPECT_EQ((TfwServer *)srv_conn->peer, srv); - - conn_acc_check ^= (long long)srv_conn; - tfw_srv_conn_put(srv_conn); - - /* - * Don't let the kernel watchdog decide - * that we are stuck in locked context. - */ - kernel_fpu_end(); - schedule(); - kernel_fpu_begin(); - } - - for (k = 0; k < srv->conn_n; ++k) { - if (srv_acc[k].srv == srv) - EXPECT_EQ(srv_acc[k].conn_acc, - conn_acc_check); - } - } - sched_helper_rr.free_sched_arg(msg); - } -err: - test_conn_release_all(sg); - test_sg_release_all(); -} - -TEST(tfw_sched_rr, sched_srv_offline_srv) -{ - test_sched_srv_offline_srv(&sched_helper_rr); -} - -TEST_SUITE(sched_rr) -{ - kernel_fpu_end(); - - tfw_server_init(); - tfw_sched_rr_init(); - - kernel_fpu_begin(); - - /* - * Schedulers have the same interface so some test cases can use generic - * implementations. Some test cases still have to know how scheduler - * work at low level. Please, keep same structure for implementation - * aware test cases across all schedulers. - * - * Implementation aware cases: - * sched_sg_one_srv_max_conn - * sched_sg_max_srv_max_conn - * sched_srv_one_srv_max_conn - * sched_srv_max_srv_max_conn - */ - - TEST_RUN(tfw_sched_rr, sg_empty); - - TEST_RUN(tfw_sched_rr, sched_sg_one_srv_zero_conn); - TEST_RUN(tfw_sched_rr, sched_sg_one_srv_max_conn); - TEST_RUN(tfw_sched_rr, sched_sg_max_srv_zero_conn); - TEST_RUN(tfw_sched_rr, sched_sg_max_srv_max_conn); - - TEST_RUN(tfw_sched_rr, sched_srv_one_srv_zero_conn); - TEST_RUN(tfw_sched_rr, sched_srv_one_srv_max_conn); - TEST_RUN(tfw_sched_rr, sched_srv_max_srv_zero_conn); - TEST_RUN(tfw_sched_rr, sched_srv_max_srv_max_conn); - TEST_RUN(tfw_sched_rr, sched_srv_offline_srv); -} From ff0cb236b844eb50f19092ce7603befd266398ea Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Fri, 7 Apr 2017 13:11:15 +0300 Subject: [PATCH 17/37] Remove .add_conn() callback. The functionality is moved to .add_grp(). Also, validate the integrity of a group in separate function at the time .add_grp() callback is called. It's called just once for each server group. 
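For illustration only, and not part of the commit itself: the validation added here boils down to walking the group's server and connection lists and comparing the counts seen during the walk against the recorded counters. Below is a minimal, self-contained C sketch of that idea using simplified stand-in types; the real code iterates the TfwSrvGroup lists with list_for_each_entry() and returns -EINVAL on a mismatch.

    #include <stddef.h>

    /* Simplified stand-ins for the real server group structures. */
    struct conn   { struct conn *next; };
    struct server { struct server *next; struct conn *conns; size_t conn_n; };
    struct group  { struct server *servers; size_t srv_n; };

    /*
     * Count servers and per-server connections; fail if a count
     * exceeds the recorded value, just as the patch below does.
     */
    int
    validate_group(const struct group *g)
    {
            const struct server *srv;
            const struct conn *conn;
            size_t si = 0, ci;

            for (srv = g->servers; srv; srv = srv->next, ++si) {
                    ci = 0;
                    for (conn = srv->conns; conn; conn = conn->next)
                            ++ci;
                    if (ci > srv->conn_n)
                            return -1;
            }
            return si > g->srv_n ? -1 : 0;
    }

Since the callback runs once per group, in process context at configuration time, the walk needs no locking.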
--- tempesta_fw/sched/tfw_sched_hash.c | 46 ++++++++++------- tempesta_fw/sched/tfw_sched_ratio.c | 79 ++++++++++------------------- tempesta_fw/server.c | 7 --- tempesta_fw/server.h | 12 ++--- tempesta_fw/sock_srv.c | 23 --------- tempesta_fw/t/unit/sched_helper.c | 7 --- 6 files changed, 59 insertions(+), 115 deletions(-) diff --git a/tempesta_fw/sched/tfw_sched_hash.c b/tempesta_fw/sched/tfw_sched_hash.c index cfbb1b492..55b40b8c0 100644 --- a/tempesta_fw/sched/tfw_sched_hash.c +++ b/tempesta_fw/sched/tfw_sched_hash.c @@ -45,7 +45,7 @@ MODULE_AUTHOR(TFW_AUTHOR); MODULE_DESCRIPTION("Tempesta hash-based scheduler"); -MODULE_VERSION("0.3.0"); +MODULE_VERSION("0.4.0"); MODULE_LICENSE("GPL"); typedef struct { @@ -143,7 +143,7 @@ tfw_sched_hash_get_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) { TfwHashSrvList *sl = sg->sched_data; unsigned long msg_hash; - unsigned long tries = sl->conn_n;; + unsigned long tries = sl->conn_n; BUG_ON(!sl); @@ -226,22 +226,19 @@ tfw_sched_hash_del_grp(TfwSrvGroup *sg) tfw_sched_hash_cleanup(sg); } +/** + * Validate the integrity of a group. + * + * Make sure that number of servers in the group, and the number + * of connections for each server match the recorded values. + */ static int -tfw_sched_hash_add_grp(TfwSrvGroup *sg) +tfw_sched_hash_validate_grp(TfwSrvGroup *sg) { - int ret = -ENOMEM; - size_t size, si, ci; - unsigned int sum_conn_n; + size_t si = 0, ci; TfwServer *srv; TfwSrvConn *srv_conn; - TfwHashSrv *hsrv; - TfwHashSrvList *sl; - /* - * Validate the number of servers in the group, and the number - * of connections for each server. - */ - si = 0; list_for_each_entry(srv, &sg->srv_list, list) { ci = 0; list_for_each_entry(srv_conn, &srv->conn_list, list) @@ -253,6 +250,23 @@ tfw_sched_hash_add_grp(TfwSrvGroup *sg) if (si > sg->srv_n) return -EINVAL; + return 0; +} + +static int +tfw_sched_hash_add_grp(TfwSrvGroup *sg) +{ + int ret = -ENOMEM; + size_t size, ci; + unsigned int sum_conn_n; + TfwServer *srv; + TfwSrvConn *srv_conn; + TfwHashSrv *hsrv; + TfwHashSrvList *sl; + + if (!tfw_sched_hash_validate_grp(sg)) + return -EINVAL; + size = sizeof(TfwHashSrvList) + sizeof(TfwHashSrv) * sg->srv_n; if (!(sg->sched_data = kzalloc(size, GFP_KERNEL))) return -ENOMEM; @@ -289,17 +303,11 @@ tfw_sched_hash_add_grp(TfwSrvGroup *sg) return ret; } -static void -tfw_sched_hash_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *conn) -{ -} - static TfwScheduler tfw_sched_hash = { .name = "hash", .list = LIST_HEAD_INIT(tfw_sched_hash.list), .add_grp = tfw_sched_hash_add_grp, .del_grp = tfw_sched_hash_del_grp, - .add_conn = tfw_sched_hash_add_conn, .sched_sg_conn = tfw_sched_hash_get_sg_conn, .sched_srv_conn = tfw_sched_hash_get_srv_conn, }; diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 2da876e60..b2b1287a0 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -780,6 +780,33 @@ tfw_sched_ratio_del_grp(TfwSrvGroup *sg) tfw_sched_ratio_cleanup(sg); } +/** + * Validate the integrity of a group. + * + * Make sure that number of servers in the group, and the number + * of connections for each server match the recorded values. 
+ */ +static int +tfw_sched_hash_validate_grp(TfwSrvGroup *sg) +{ + size_t si = 0, ci; + TfwServer *srv; + TfwSrvConn *srv_conn; + + list_for_each_entry(srv, &sg->srv_list, list) { + ci = 0; + list_for_each_entry(srv_conn, &srv->conn_list, list) + ++ci; + if (ci > srv->conn_n) + return -EINVAL; + ++si; + } + if (si > sg->srv_n) + return -EINVAL; + + return 0; +} + /** * Add a server group to Ratio Scheduler. * @@ -797,20 +824,7 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) TfwRatioPool *rpool; TfwRatioSrvDesc *trsdesc, *srvdesc; - /* - * Validate the number of servers in the group, and the number - * of connections for each server. - */ - si = 0; - list_for_each_entry(srv, &sg->srv_list, list) { - ci = 0; - list_for_each_entry(srv_conn, &srv->conn_list, list) - ++ci; - if (ci > srv->conn_n) - return -EINVAL; - ++si; - } - if (si > sg->srv_n) + if (!tfw_sched_hash_validate_grp(sg)) return -EINVAL; /* Pool of TfwRatio{}. Initial place for Ratio Scheduler data. */ @@ -879,48 +893,11 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) return ret; } -/** - * Add a connection and a server, if new, to the scheduler. - * Called at configuration stage, no synchronization is required. - * - * The whole server and server connections data for a group is complete - * at the time the group is added to the scheduler with add_grp(). Thus - * the actual role of the function is to make cure that data is the same. - * The logic is based on the assumption that servers and connections are - * submitted in the same order as they were when add_grp() was called. - */ -static void -tfw_sched_ratio_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) -{ - static size_t si = 0, ci = 0; - TfwRatioPool *rpool = sg->sched_data; - TfwRatio *ratio; - TfwRatioSrvDesc *srvdesc; - TfwSrvConn *rconn; - - BUG_ON(!rpool); - ratio = rpool->ratio; - - /* Make sure that data is the same. */ - srvdesc = ratio->srvdesc + si; - BUG_ON(srvdesc->srv != srv); - - rconn = srvdesc->conns[ci]; - BUG_ON(rconn != srv_conn); - - if (++ci == srv->conn_n) { - ci = 0; - if (++si == sg->srv_n) - si = 0; - } -} - static TfwScheduler tfw_sched_ratio = { .name = "ratio", .list = LIST_HEAD_INIT(tfw_sched_ratio.list), .add_grp = tfw_sched_ratio_add_grp, .del_grp = tfw_sched_ratio_del_grp, - .add_conn = tfw_sched_ratio_add_conn, .sched_sg_conn = tfw_sched_ratio_sched_sg_conn, .sched_srv_conn = tfw_sched_ratio_sched_srv_conn, }; diff --git a/tempesta_fw/server.c b/tempesta_fw/server.c index d5d52f20e..3ae4a4608 100644 --- a/tempesta_fw/server.c +++ b/tempesta_fw/server.c @@ -182,13 +182,6 @@ tfw_sg_add(TfwSrvGroup *sg, TfwServer *srv) write_unlock(&sg->lock); } -void -tfw_sg_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) -{ - if (sg->sched && sg->sched->add_conn) - sg->sched->add_conn(sg, srv, srv_conn); -} - int tfw_sg_set_sched(TfwSrvGroup *sg, const char *sched_name) { diff --git a/tempesta_fw/server.h b/tempesta_fw/server.h index 13561df7c..5e487369e 100644 --- a/tempesta_fw/server.h +++ b/tempesta_fw/server.h @@ -112,13 +112,12 @@ struct tfw_srv_group_t { * @name - name of the algorithm; * @list - member in the list of registered schedulers; * @add_grp - add server group to the scheduler. - Called in process context at configuration time. - * Called only after the group is set up with all servers; + * Called in process context at configuration time. 
+ * Called only after all servers are set up with connections, + * and the group is set up with all servers; * @del_grp - delete server group from the scheduler; - * @add_conn - add connection and server if it's new. - Called in process context at configuration time; * @sched_grp - server group scheduling virtual method. - Typically returns the result of @tfw_sched_get_sg_srv_conn(); + * Typically returns the result of @tfw_sched_get_sg_srv_conn(); * @sched_sg_conn - virtual method. Schedule a request to a server from * given server group. Returns a server connection; * @sched_srv_conn - schedule a request to the given server. @@ -136,8 +135,6 @@ struct tfw_scheduler_t { struct list_head list; int (*add_grp)(TfwSrvGroup *sg); void (*del_grp)(TfwSrvGroup *sg); - void (*add_conn)(TfwSrvGroup *sg, TfwServer *srv, - TfwSrvConn *srv_conn); TfwSrvConn *(*sched_grp)(TfwMsg *msg); TfwSrvConn *(*sched_sg_conn)(TfwMsg *msg, TfwSrvGroup *sg); TfwSrvConn *(*sched_srv_conn)(TfwMsg *msg, TfwServer *srv); @@ -177,7 +174,6 @@ void tfw_sg_free(TfwSrvGroup *sg); unsigned int tfw_sg_count(void); void tfw_sg_add(TfwSrvGroup *sg, TfwServer *srv); -void tfw_sg_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn); int tfw_sg_set_sched(TfwSrvGroup *sg, const char *sched); int tfw_sg_for_each_srv(int (*cb)(TfwServer *srv)); void tfw_sg_release_all(void); diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c index 6e6ff4898..a261e0779 100644 --- a/tempesta_fw/sock_srv.c +++ b/tempesta_fw/sock_srv.c @@ -527,24 +527,6 @@ tfw_srv_conn_free(TfwSrvConn *srv_conn) kmem_cache_free(tfw_srv_conn_cache, srv_conn); } -static inline int -__tfw_sock_srv_sg_add_conn_cb(TfwSrvConn *srv_conn) -{ - TfwServer *srv = (TfwServer *)srv_conn->peer; - tfw_sg_add_conn(srv->sg, srv, srv_conn); - - return 0; -} - -static int -tfw_sock_srv_sg_add_conns(TfwServer *srv) -{ - TfwSrvConn *srv_conn; - - return tfw_peer_for_each_conn(srv, srv_conn, list, - __tfw_sock_srv_sg_add_conn_cb); -} - static int tfw_sock_srv_add_conns(TfwServer *srv) { @@ -1213,11 +1195,6 @@ tfw_sock_srv_start(void) if ((ret = tfw_cfgop_setup_srv_group())) return ret; } - /* Add connections to scheduler for all servers in all groups. */ - if ((ret = tfw_sg_for_each_srv(tfw_sock_srv_sg_add_conns)) != 0) { - TFW_ERR_NL("Error adding server connections\n"); - return ret; - } /* * This must be executed only after the complete configuration * has been processed as it depends on configuration directives diff --git a/tempesta_fw/t/unit/sched_helper.c b/tempesta_fw/t/unit/sched_helper.c index e3027c02c..1d1c7e079 100644 --- a/tempesta_fw/t/unit/sched_helper.c +++ b/tempesta_fw/t/unit/sched_helper.c @@ -68,9 +68,6 @@ test_create_sg(const char *name) void test_start_sg(TfwSrvGroup *sg, const char *sched_name) { - TfwServer *srv; - TfwSrvConn *srv_conn; - kernel_fpu_end(); { @@ -78,10 +75,6 @@ test_start_sg(TfwSrvGroup *sg, const char *sched_name) BUG_ON(r); } - list_for_each_entry(srv, &sg->srv_list, list) - list_for_each_entry(srv_conn, &srv->conn_list, list) - sg->sched->add_conn(sg, srv, srv_conn); - kernel_fpu_begin(); } From f8c11b34feee01e3bccff12fd32f3edceb4ade77 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Fri, 7 Apr 2017 14:49:21 +0300 Subject: [PATCH 18/37] Address code review comments. 
--- tempesta_fw/sched/tfw_sched_ratio.c | 97 +++++++++++++++-------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index b2b1287a0..2163bd311 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -28,7 +28,7 @@ MODULE_AUTHOR(TFW_AUTHOR); MODULE_DESCRIPTION("Tempesta Ratio Scheduler"); -MODULE_VERSION("0.3.0"); +MODULE_VERSION("0.1.0"); MODULE_LICENSE("GPL"); #define TFW_SCHED_RATIO_INTVL (HZ / 20) /* The timer periodicity. */ @@ -73,7 +73,7 @@ typedef struct { * * @lock - must be in the same cache line for faster operations. * @csidx - index of current server data entry. - * @reidx - index of next server data entry which ratio we need + * @reidx - index of next server data entry which ratio we need * to reset, or @srv_n if no resetting is needed. * @riter - ratio iteration, indicates the number of times we need * to choose all servers before the current one until we @@ -169,10 +169,10 @@ tfw_sched_ratio_srvdata_cmp(const void *lhs, const void *rhs) * Return a non-zero value if additional actions are needed. */ static int -tfw_sched_ratio_calc(TfwRatio *ratio, unsigned int *arg_max_val_idx) +tfw_sched_ratio_calc(TfwRatio *ratio, size_t *arg_max_val_idx) { - size_t si; - unsigned int diff, max_val_idx, max_wgt, oratio; + size_t si, max_val_idx; + unsigned int diff, max_wgt, oratio; unsigned long unit, sum_wgt = 0, sum_ratio = 0; TfwRatioSrvData *srvdata = ratio->srvdata; TfwRatioSchData *schdata = &ratio->schdata; @@ -236,8 +236,7 @@ tfw_sched_ratio_calc(TfwRatio *ratio, unsigned int *arg_max_val_idx) static void tfw_sched_ratio_calc_static(TfwRatio *ratio) { - size_t si; - unsigned int max_val_idx = 0; + size_t si, max_val_idx = 0; TfwRatioSrvDesc *srvdesc = ratio->srvdesc; TfwRatioSrvData *srvdata = ratio->srvdata; @@ -285,9 +284,8 @@ tfw_sched_ratio_calc_static(TfwRatio *ratio) static int tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) { - size_t si, left = 0, right = 0; - unsigned int recalc = 0, max_ratio = 0; - unsigned int has_one_val = 0, max_val_idx = 0; + size_t si, max_val_idx = 0, left = 0, right = 0; + unsigned int recalc = 0, max_ratio = 0, has_one_val = 0; unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; TfwPrcntlStats pstats = { .ith = tfw_pstats_ith, @@ -343,7 +341,7 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) } } if (has_one_val) { - unsigned int orsum = ratio->schdata.orsum; + unsigned long orsum = ratio->schdata.orsum; TfwRatioSrvData sdent_one = srvdata[si]; TfwRatioSrvData sdent_max = srvdata[max_val_idx]; @@ -377,7 +375,7 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) * end of the array. Reverse the sequence of server descriptor * indices in that part of the array. */ - if (has_one_val) { + if (!has_one_val) { left = 0; right = ratio->srv_n - 1; } else { @@ -398,9 +396,9 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) return 1; } -/* - * * Get a free for use entry from the RCU pool. - * */ +/** + * Get a free for use entry from the RCU pool. + */ static TfwRatio * tfw_sched_ratio_rpool_get(TfwRatioPool *rpool) { @@ -415,23 +413,23 @@ tfw_sched_ratio_rpool_get(TfwRatioPool *rpool) } } - return NULL; + return NULL; } -/* +/** * Return an entry to the RCU pool. 
*/ static inline void __tfw_sched_ratio_rpool_put(TfwRatio *ratio) { - atomic_set(&ratio->free, 1); - smp_mb__after_atomic(); + atomic_set(&ratio->free, 1); + smp_mb__after_atomic(); } static void tfw_sched_ratio_rpool_put(struct rcu_head *rcup) { - TfwRatio *ratio = container_of(rcup, TfwRatio, rcu); + TfwRatio *ratio = container_of(rcup, TfwRatio, rcu); __tfw_sched_ratio_rpool_put(ratio); } @@ -515,7 +513,7 @@ tfw_sched_ratio_tmfn(unsigned long tmfn_data) static inline bool tfw_sched_ratio_is_srv_turn(TfwRatio *ratio, size_t csidx) { - unsigned int headsum2, tailsum2; + unsigned long headsum2, tailsum2; TfwRatioSrvData *srvdata = ratio->srvdata; TfwRatioSchData *schdata = &ratio->schdata; @@ -642,26 +640,26 @@ __sched_srv(TfwRatioSrvDesc *srvdesc, int skipnip, int *nipconn) static TfwSrvConn * tfw_sched_ratio_sched_srv_conn(TfwMsg *msg, TfwServer *srv) { - int skipnip = 1, nipconn = 0; - TfwRatioSrvDesc *srvdesc = srv->sched_data; - TfwSrvConn *srv_conn; + int skipnip = 1, nipconn = 0; + TfwRatioSrvDesc *srvdesc = srv->sched_data; + TfwSrvConn *srv_conn; - /* + /* * For @srv without connections @srvdesc will be NULL. Normally, * it doesn't happen in real life, but unit tests check this case. */ - if (unlikely(!srvdesc)) - return NULL; + if (unlikely(!srvdesc)) + return NULL; rerun: - if ((srv_conn = __sched_srv(srvdesc, skipnip, &nipconn))) - return srv_conn; + if ((srv_conn = __sched_srv(srvdesc, skipnip, &nipconn))) + return srv_conn; - if (skipnip && nipconn) { - skipnip = 0; - goto rerun; - } + if (skipnip && nipconn) { + skipnip = 0; + goto rerun; + } - return NULL; + return NULL; } /** @@ -686,10 +684,9 @@ tfw_sched_ratio_sched_srv_conn(TfwMsg *msg, TfwServer *srv) static TfwSrvConn * tfw_sched_ratio_sched_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) { - size_t srv_tried_n = 0; - int skipnip = 1, nipconn = 0; + unsigned int attempts, skipnip = 1, nipconn = 0; TfwRatioPool *rpool = sg->sched_data; - TfwRatioSrvDesc *srvdesc, *srvdesc_last = NULL; + TfwRatioSrvDesc *srvdesc; TfwSrvConn *srv_conn; TfwRatio *ratio; @@ -700,28 +697,34 @@ tfw_sched_ratio_sched_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) BUG_ON(!ratio); rerun: /* - * Try each server in a group. Attempt to schedule a connection - * to a server that doesn't fall under a set of restrictions. + * Try servers in a group according to their ratios. Attempt to + * schedule a connection that is not under a set of restrictions. * - * FIXME: The way the algorithm works, same server may be chosen + * NOTE: The way the algorithm works, same server may be chosen * multiple times in a row, even if that's the server where all * connections were under restrictions for one reason or another. * The idea is that the conditions for server's connections may * change any time, and so the next time one or more connections * to the same server will not be restricted. - * Perhaps, though, it makes sense to skip these servers that - * were restricted, and go directly to the next server. Getting - * the next server reqires a lock, so perhaps it makes sense to - * to skip these repetitive servers while under the lock. + * Also, servers are chosen concurrently, so a particular thread + * may not be able to probe all servers in a group. + * + * These properties suggest that a limit is needed on the number + * of attempts to find the right connection. This limit appears + * to be purely empirical. + * + * A tricky issue here is that the algorithm assumes two passes. 
+ * One runs under the full set of restrictions, and the other runs + * under restrictions that are slightly relaxed. It's likely + * that servers probed in these two passes are not the same. */ - while (srv_tried_n < ratio->srv_n) { + attempts = ratio->srv_n * 2 + 1; + while (--attempts) { srvdesc = tfw_sched_ratio_next_srv(ratio); if ((srv_conn = __sched_srv(srvdesc, skipnip, &nipconn))) { rcu_read_unlock(); return srv_conn; } - if (srvdesc != srvdesc_last) - ++srv_tried_n; } /* Relax the restrictions and re-run the search cycle. */ if (skipnip && nipconn) { From 4340f5dd9eb96d63ecd3ff722f1d31001ccc801e Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Tue, 18 Apr 2017 12:24:33 +0300 Subject: [PATCH 19/37] Predictive Scheduler implementation (simple linear regression). --- README.md | 21 +- tempesta_fw/sched/tfw_sched_hash.c | 2 +- tempesta_fw/sched/tfw_sched_ratio.c | 471 +++++++++++++++++++++------- tempesta_fw/server.c | 2 +- tempesta_fw/server.h | 17 +- tempesta_fw/sock_srv.c | 356 ++++++++++++++------- 6 files changed, 630 insertions(+), 239 deletions(-) diff --git a/README.md b/README.md index 2d6dbc53f..e862c29ea 100644 --- a/README.md +++ b/README.md @@ -552,10 +552,20 @@ Specific type of dynamic weight is specified with additional options: 95, 99. If none is given, then the default percentile of 90 is used. If a specific type of dynamic weight is not specified, then the default type of `average` is used. - -Naturally, if a dynamic scheduler is specified for a group, and there's +* **predict** - The weight of each server in a group is predicted dynamically +for a time in the future, based on the server's behavior in the past. Additional +options include those that are defined for **dynamic** weight, as well as +the following options: + * **past** - Period of time (in seconds) to keep past response time + values from a server. The default value is 30 seconds. + * **rate** - Rate (times per second) of retrieval of past response time + values. The default value is 20 times per second. + * **ahead** - Period of time (in seconds) for which to make a prediction. + It can't be more than half of **past**. The default value is 15 seconds. + +Naturally, if a Dynamic Scheduler is specified for a group, and there's a server in that group with the `weight` option, then an error is produced -as that combination is incompatible. +as that combination is incompatible. The same is true for the Predictive Scheduler. The following are examples of scheduler specification in configuration. Again, only one `sched` directive is allowed per group. @@ -575,6 +585,11 @@ sched dynamic maximum; sched dynamic percentile; # Use dynamic scheduler, percentile of 75 is used for weight distribution. sched dynamic percentile 75; +# Use predictive scheduler, percentile of 75 is used for weight distribution. +# The response time values of each server are collected for the past 60 seconds +# at the rate of 20 times per second, and the weight of each server is predicted +# for the time of 2 seconds ahead. +sched predict percentile 75 past=60 rate=20 ahead=2; ``` Servers should be grouped together with proper care.
Server groups should diff --git a/tempesta_fw/sched/tfw_sched_hash.c b/tempesta_fw/sched/tfw_sched_hash.c index 55b40b8c0..083c75f16 100644 --- a/tempesta_fw/sched/tfw_sched_hash.c +++ b/tempesta_fw/sched/tfw_sched_hash.c @@ -264,7 +264,7 @@ tfw_sched_hash_add_grp(TfwSrvGroup *sg) TfwHashSrv *hsrv; TfwHashSrvList *sl; - if (!tfw_sched_hash_validate_grp(sg)) + if (tfw_sched_hash_validate_grp(sg)) return -EINVAL; size = sizeof(TfwHashSrvList) + sizeof(TfwHashSrv) * sg->srv_n; diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 2163bd311..64cb577df 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -91,6 +91,57 @@ typedef struct { unsigned long orsum; } TfwRatioSchData; +/** + * Historic (past) data unit for an individual upstream server. + * + * @x - count of timer function invocations. + * @y - RTT from APM in msecs. + */ +typedef struct { + unsigned long x; + unsigned long y; +} TfwRatioHstXY; + +/** + * Historic (past) data set for an individual upstream server. + * This is the data set for simple linear regression calculation. + * + * @a - coefficient for y = a + b * x + eps. + * @b - coefficient for y = a + b * x + eps. + * @x_avg - average x value. + * @y_avg - average y value. + * @xy_avg - avg(x * y). + * @x_avg_y_avg - avg(x) * avg(y). + * @x_sq_avg - avg(x * x). + * @x_avg_sq - avg(x) * avg(x). + */ +typedef struct { + long a; + long b; + long x_avg; + long y_avg; + long xy_avg; + long x_avg_y_avg; + long x_sq_avg; + long x_avg_sq; + TfwRatioHstXY *hist; +} TfwRatioHstDesc; + +/** + * Historic (past) data for predictive scheduler. + * + * @ahead - predict for this number of @intvl ahead. + * @past_sz - total number of slots for past data. + * @counter - slot that is available for storing past data. + * @past - past data for each server (@past[@srv_n]). + */ +typedef struct { + unsigned int ahead; + size_t past_sz; + unsigned long counter; + TfwRatioHstDesc *past; +} TfwRatioHstData; + /** * The main Ratio Scheduler structure. * @@ -99,20 +150,12 @@ typedef struct { * * @rcu - RCU control structure; * @free - indicates that the pool entry is available for use. - * @srv_n - number of upstream servers. - * @psidx - APM pstats[] value index for dynamic ratios. - * @sched - scheduler data. - * @srvdesc - array of upstream server descriptors, shared between - * RCU pool entries. * @srvdata - scheduler data specific to each server in the group. * @schdata - scheduler data common to all servers in the group. */ typedef struct { struct rcu_head rcu; atomic_t free; - size_t srv_n; - size_t psidx; - TfwRatioSrvDesc *srvdesc; TfwRatioSrvData *srvdata; TfwRatioSchData schdata; } TfwRatio; @@ -120,14 +163,24 @@ typedef struct { /** * The pool of TfwRatio{} structures for RCU. * - * @pool - pool of TfwRatio{} for RCU. + * @srv_n - number of upstream servers. + * @psidx - APM pstats[] value index for dynamic ratios. + * @intvl - interval for re-arming the timer. + * @rpool - pool of TfwRatio{} for RCU. * @ratio - pointer to the currently used structure. + * @hstdata - historic data for predictive scheduler. + * @srvdesc - array of upstream server descriptors. * @rearm - indicates if the timer can be re-armed. * @timer - periodic timer for dynamic APM data. 
*/ typedef struct { + size_t srv_n; + size_t psidx; + unsigned int intvl; TfwRatio *rpool; TfwRatio __rcu *ratio; + TfwRatioHstData *hstdata; + TfwRatioSrvDesc *srvdesc; atomic_t rearm; struct timer_list timer; } TfwRatioPool; @@ -169,7 +222,7 @@ tfw_sched_ratio_srvdata_cmp(const void *lhs, const void *rhs) * Return a non-zero value if additional actions are needed. */ static int -tfw_sched_ratio_calc(TfwRatio *ratio, size_t *arg_max_val_idx) +tfw_sched_ratio_calc(TfwRatioPool *rpool, TfwRatio *ratio, size_t *arg_mvidx) { size_t si, max_val_idx; unsigned int diff, max_wgt, oratio; @@ -186,7 +239,7 @@ tfw_sched_ratio_calc(TfwRatio *ratio, size_t *arg_max_val_idx) * the group are the same. */ diff = max_val_idx = 0; - for (si = 0; si < ratio->srv_n; ++si) { + for (si = 0; si < rpool->srv_n; ++si) { if (srvdata[max_val_idx].weight < srvdata[si].weight) max_val_idx = si; sum_wgt += srvdata[si].weight; @@ -196,16 +249,16 @@ tfw_sched_ratio_calc(TfwRatio *ratio, size_t *arg_max_val_idx) /* Set up the common part of scheduler data. */ schdata->csidx = 0; schdata->riter = 1; - schdata->reidx = ratio->srv_n; + schdata->reidx = rpool->srv_n; /* * If all server weights are the same, then there's no need to do * anything else. Set up all ratios to 1 and be done with it. */ if (!diff) { - for (si = 0; si < ratio->srv_n; ++si) + for (si = 0; si < rpool->srv_n; ++si) srvdata[si].cratio = srvdata[si].oratio = 1; - schdata->crsum = schdata->orsum = ratio->srv_n; + schdata->crsum = schdata->orsum = rpool->srv_n; return 0; } @@ -214,8 +267,8 @@ tfw_sched_ratio_calc(TfwRatio *ratio, size_t *arg_max_val_idx) * if all calculated ratios are the same. Set up scheduler data. */ max_wgt = srvdata[max_val_idx].weight; - unit = ((max_wgt + ratio->srv_n) * max_wgt) / sum_wgt; - for (si = 0; si < ratio->srv_n; ++si) { + unit = ((max_wgt + rpool->srv_n) * max_wgt) / sum_wgt; + for (si = 0; si < rpool->srv_n; ++si) { oratio = (unit * srvdata[si].weight) / max_wgt ? : 1; srvdata[si].cratio = srvdata[si].oratio = oratio; diff |= (oratio != srvdata[0].oratio); @@ -224,7 +277,7 @@ tfw_sched_ratio_calc(TfwRatio *ratio, size_t *arg_max_val_idx) schdata->crsum = schdata->orsum = sum_ratio; /* Return the index of server data entry with maximum ratio. */ - *arg_max_val_idx = max_val_idx; + *arg_mvidx = max_val_idx; return diff; } @@ -234,31 +287,29 @@ tfw_sched_ratio_calc(TfwRatio *ratio, size_t *arg_max_val_idx) * weights that are statically defined in the configuration file. */ static void -tfw_sched_ratio_calc_static(TfwRatio *ratio) +tfw_sched_ratio_calc_static(TfwRatioPool *rpool, TfwRatio *ratio) { size_t si, max_val_idx = 0; - TfwRatioSrvDesc *srvdesc = ratio->srvdesc; + TfwRatioSrvDesc *srvdesc = rpool->srvdesc; TfwRatioSrvData *srvdata = ratio->srvdata; /* Collect server weights from the configuration. */ - for (si = 0; si < ratio->srv_n; ++si) { + for (si = 0; si < rpool->srv_n; ++si) { srvdata[si].sdidx = si; srvdata[si].weight = srvdesc[si].srv->weight; } /* Calculate ratios based on server weights. */ - if (!tfw_sched_ratio_calc(ratio, &max_val_idx)) + if (!tfw_sched_ratio_calc(rpool, ratio, &max_val_idx)) return; /* Sort server data entries by ratio in descending order. */ - sort(srvdata, ratio->srv_n, sizeof(srvdata[0]), + sort(srvdata, rpool->srv_n, sizeof(srvdata[0]), tfw_sched_ratio_srvdata_cmp, tfw_sched_ratio_srvdata_swap); } /** * Calculate ratios for each server in a group based on dynamic data. 
- * the function runs periodically on timer and provides the data that - * is used by the ratio scheduler for outgoing requests. * * Latest dynamic data is provided by APM module and represent RTT values * for each server in a group. Ratios are calculated on those RTT values. @@ -277,56 +328,17 @@ tfw_sched_ratio_calc_static(TfwRatio *ratio) * Those are entries at the start and at the end of the array. Reverse * the sequence of server descriptor indices in that part of the array. * The resulting pairing of servers to ratios is the target. - * - * Return 0 if there are no new ratio values. - * Return a non-zero value if new ratio values were calculated. */ -static int -tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) +static void +__tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) { size_t si, max_val_idx = 0, left = 0, right = 0; - unsigned int recalc = 0, max_ratio = 0, has_one_val = 0; - unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; - TfwPrcntlStats pstats = { - .ith = tfw_pstats_ith, - .val = val, - .psz = ARRAY_SIZE(tfw_pstats_ith) - }; + unsigned int max_ratio = 0, has_one_val = 0; TfwRatioSrvData *srvdata = ratio->srvdata; - TfwRatioSrvDesc *srvdesc = ratio->srvdesc; - - /* - * Collect server RTT values from APM module. See if APM may have - * provided new data, and a recalculation is required. Otherwise - * there's nothing to do. - * - * TODO: The following cases should be considered. - * 1. APM recalculates the stats on each request-response pair. - * It's quite possible that the actual stats values did not - * change. However, the APM doesn't know of that and reports - * that the values may have changed. It would be great to - * catch that and avoid the recalculation of ratios. - * 2. Depending on actual RTT values a small deviation from the - * previous value should be acceptable. It should not cause - * a recalculation of ratio. - * 3. Finally, a typical case is that only a handful of servers - * misbehave in a large group of servers. Is there a way to - * detect that and do a partial recalculation of ratios? - */ - for (si = 0; si < ratio->srv_n; ++si) { - pstats.seq = srvdesc[si].seq; - recalc |= tfw_apm_stats(srvdesc[si].srv->apm, &pstats); - srvdesc[si].seq = pstats.seq; - - srvdata[si].sdidx = si; - srvdata[si].weight = pstats.val[ratio->psidx] ? : 1; - } - if (!recalc) - return 0; /* Calculate ratios based on server RTT values. */ - if (!tfw_sched_ratio_calc(ratio, &max_val_idx)) - return 1; + if (!tfw_sched_ratio_calc(rpool, ratio, &max_val_idx)) + return; /* * It's guaranteed here that NOT all calculated ratio values are @@ -334,7 +346,7 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) * do actions described in step 1 in the function's description. * Adjust the sum of ratios that is changed in this procedure. */ - for (si = 0; si < ratio->srv_n; ++si) { + for (si = 0; si < rpool->srv_n; ++si) { if (srvdata[si].oratio == 1) { has_one_val = 1; break; @@ -348,7 +360,7 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) /* Save maximum ratio value for future use. */ max_ratio = srvdata[max_val_idx].oratio; - for (si = 0; si < ratio->srv_n; ++si) { + for (si = 0; si < rpool->srv_n; ++si) { if (srvdata[si].oratio == 1) { srvdata[si].weight = sdent_max.weight; srvdata[si].oratio = @@ -365,7 +377,7 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) } /* Sort server data entries by ratio in descending order. 
*/ - sort(srvdata, ratio->srv_n, sizeof(srvdata[0]), + sort(srvdata, rpool->srv_n, sizeof(srvdata[0]), tfw_sched_ratio_srvdata_cmp, tfw_sched_ratio_srvdata_swap); /* @@ -377,9 +389,9 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) */ if (!has_one_val) { left = 0; - right = ratio->srv_n - 1; + right = rpool->srv_n - 1; } else { - for (si = 0; si < ratio->srv_n; ++si) + for (si = 0; si < rpool->srv_n; ++si) if (srvdata[si].oratio == max_ratio) { left = si + 1; } else if (srvdata[si].oratio == 1) { @@ -393,6 +405,158 @@ tfw_sched_ratio_calc_dynamic(TfwRatio *ratio) srvdata[right--].sdidx = left_sdidx; } + return; +} + +/** + * Fill scheduler's ratio entry with APM data for each server. + * + * Return 0 if there is no new APM data. + * Return a non-zero value otherwise. + */ +static int +tfw_sched_ratio_fill_apmdata(TfwRatioPool *rpool, TfwRatio *ratio) +{ + size_t si; + unsigned int recalc = 0; + unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; + TfwPrcntlStats pstats = { + .ith = tfw_pstats_ith, + .val = val, + .psz = ARRAY_SIZE(tfw_pstats_ith) + }; + TfwRatioSrvData *srvdata = ratio->srvdata; + TfwRatioSrvDesc *srvdesc = rpool->srvdesc; + + /* + * Collect server RTT values from APM module. See if APM may have + * provided new data, and a recalculation is required. Otherwise + * there's nothing to do. + * + * TODO: The following cases should be considered. + * 1. APM recalculates the stats on each request-response pair. + * It's quite possible that the actual stats values did not + * change. However, the APM doesn't know of that and reports + * that the values may have changed. It would be great to + * catch that and avoid the recalculation of ratios. + * 2. Depending on actual RTT values a small deviation from the + * previous value should be acceptable. It should not cause + * a recalculation of ratio. + * 3. Finally, a typical case is that only a handful of servers + * misbehave in a large group of servers. Is there a way to + * detect that and do a partial recalculation of ratios? + */ + for (si = 0; si < rpool->srv_n; ++si) { + pstats.seq = srvdesc[si].seq; + recalc |= tfw_apm_stats(srvdesc[si].srv->apm, &pstats); + srvdesc[si].seq = pstats.seq; + + srvdata[si].sdidx = si; + srvdata[si].weight = pstats.val[rpool->psidx] ? : 1; + } + + return recalc; +} + +/** + * Calculate ratios for each server in a group based on dynamic data. + * Latest dynamic data is provided by APM module and represent RTT values + * for each server in a group. Ratios are calculated on those RTT values. + * + * The function runs periodically on timer and provides the data that + * is used by the ratio scheduler for outgoing requests. + * + * Return 0 if there are no new ratio values. + * Return a non-zero value if new ratio values were calculated. + */ +static int +tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) +{ + if (!tfw_sched_ratio_fill_apmdata(rpool, ratio)) + return 0; + __tfw_sched_ratio_calc_dynamic(rpool, ratio); + return 1; +} + +/** + * Calculate ratios for each server in a group based on predicted values + * derived from dynamic data. The dynamic data is provided by APM module + * and represent RTT values for each server in a group. The RTT values + * are collected within a latest period of time (time window) and then + * used to predict the future RTT values that will be in action until + * the next run of this function. Server ratios are calculated on those + * predicted RTT values. 
+ * + * A simple linear regression calculation on a sliding data window is + * used to predict future RTT values for each server. @y is an RTT value + * from APM, and @x is the current number of invocations of this timer + * function (every @intvl msecs). Essentially @x is a measure of time. + * + * The function runs periodically on timer and provides the data that + * is used by the ratio scheduler for outgoing requests. + * + * Return 0 if there are no new ratio values. + * Return a non-zero value if new ratio values were calculated. + */ +static int +tfw_sched_ratio_calc_predict(TfwRatioPool *rpool, TfwRatio *ratio) +{ + TfwRatioHstData *hstdata = rpool->hstdata; + TfwRatioSrvData *srvdata = ratio->srvdata; + static const long MUL = 1000; + unsigned long x = hstdata->counter * MUL; + size_t si, sz, ni; + + tfw_sched_ratio_fill_apmdata(rpool, ratio); + + ni = hstdata->counter % hstdata->past_sz; + + for (si = 0; si < rpool->srv_n; ++si) { + unsigned long y = srvdata[si].weight * MUL; + TfwRatioHstDesc *hd = &hstdata->past[si]; + + if (unlikely(hstdata->counter < hstdata->past_sz)) { + sz = ni + 1; + hd->x_avg = (hd->x_avg * ni + x) / sz; + hd->y_avg = (hd->y_avg * ni + y) / sz; + hd->xy_avg = (hd->xy_avg * ni + x * y) / sz; + hd->x_avg_y_avg = hd->x_avg * hd->y_avg; + hd->x_sq_avg = (hd->x_sq_avg * ni + x * x) / sz; + hd->x_avg_sq = hd->x_avg * hd->x_avg; + } else { + unsigned long h_x = hd->hist[ni].x; + unsigned long h_y = hd->hist[ni].y; + sz = hstdata->past_sz; + hd->x_avg = hd->x_avg - (h_x - x) / sz; + hd->y_avg = hd->y_avg - (h_y - y) / sz; + hd->xy_avg = hd->xy_avg - (h_x * h_y - x * y) / sz; + hd->x_avg_y_avg = hd->x_avg * hd->y_avg; + hd->x_sq_avg = hd->x_sq_avg - (h_x * h_x - x * x) / sz; + hd->x_avg_sq = hd->x_avg * hd->x_avg; + } + + hd->hist[ni].x = x; + hd->hist[ni].y = y; + + if (hd->x_sq_avg == hd->x_avg_sq) { + hd->a = 0; + hd->b = hd->x_avg ? hd->y_avg / hd->x_avg : 1; + } else { + hd->b = (hd->xy_avg - hd->x_avg_y_avg) + / (hd->x_sq_avg - hd->x_avg_sq); + hd->a = (hd->y_avg - hd->b * hd->x_avg) / MUL; + } + } + + x = hstdata->counter + hstdata->ahead; + for (si = 0; si < rpool->srv_n; ++si) { + TfwRatioHstDesc *hd = &hstdata->past[si]; + long prediction = hd->a + hd->b * x; + srvdata[si].weight = prediction <= 0 ? 1 : prediction; + } + ++hstdata->counter; + + __tfw_sched_ratio_calc_dynamic(rpool, ratio); return 1; } @@ -406,9 +570,10 @@ tfw_sched_ratio_rpool_get(TfwRatioPool *rpool) TfwRatio *ratio = rpool->rpool; for (si = 0; si <= nr_cpu_ids; ++si, ++ratio) { - smp_mb__before_atomic(); + smp_mb(); if (atomic_read(&ratio->free)) { atomic_set(&ratio->free, 0); + smp_mb__after_atomic(); return ratio; } } @@ -449,12 +614,12 @@ tfw_sched_ratio_rpool_put(struct rcu_head *rcup) * is chosen as one more than the number of CPU slots in the system. */ static void -tfw_sched_ratio_tmfn(unsigned long tmfn_data) +tfw_sched_ratio_calc_tmfn(TfwSrvGroup *sg, + int (*calc_fn)(TfwRatioPool *, TfwRatio *)) { - TfwSrvGroup *sg = (TfwSrvGroup *)tmfn_data; TfwRatioPool *rpool = sg->sched_data; TfwRatio *cratio, *nratio; - int interval = TFW_SCHED_RATIO_INTVL; + int interval = rpool->intvl; /* * Get an available ratio entry from the RCU pool. If there's @@ -471,7 +636,7 @@ tfw_sched_ratio_tmfn(unsigned long tmfn_data) * Calculate dynamic ratios. If there's nothing to do, then * return the ratio entry back to the RCU pool. 
*/ - if (!tfw_sched_ratio_calc_dynamic(nratio)) { + if (!calc_fn(rpool, nratio)) { __tfw_sched_ratio_rpool_put(nratio); goto rearm; } @@ -486,11 +651,31 @@ tfw_sched_ratio_tmfn(unsigned long tmfn_data) call_rcu(&cratio->rcu, tfw_sched_ratio_rpool_put); rearm: - smp_mb__before_atomic(); + smp_mb(); if (atomic_read(&rpool->rearm)) mod_timer(&rpool->timer, jiffies + interval); } +/** + * Periodic function for Dynamic Ratio Scheduler. + */ +static void +tfw_sched_ratio_dynamic_tmfn(unsigned long tmfn_data) +{ + tfw_sched_ratio_calc_tmfn((TfwSrvGroup *)tmfn_data, + tfw_sched_ratio_calc_dynamic); +} + +/** + * Periodic function for Predictive Ratio Scheduler. + */ +static void +tfw_sched_ratio_predict_tmfn(unsigned long tmfn_data) +{ + tfw_sched_ratio_calc_tmfn((TfwSrvGroup *)tmfn_data, + tfw_sched_ratio_calc_predict); +} + /* * Determine if it's the turn of the server described by the server * data entry at index @csidx. @@ -511,7 +696,7 @@ tfw_sched_ratio_tmfn(unsigned long tmfn_data) * TODO: The algorithm may and should be improved. */ static inline bool -tfw_sched_ratio_is_srv_turn(TfwRatio *ratio, size_t csidx) +tfw_sched_ratio_is_srv_turn(TfwRatioPool *rpool, TfwRatio *ratio, size_t csidx) { unsigned long headsum2, tailsum2; TfwRatioSrvData *srvdata = ratio->srvdata; @@ -522,9 +707,9 @@ tfw_sched_ratio_is_srv_turn(TfwRatio *ratio, size_t csidx) headsum2 = (srvdata[0].cratio + srvdata[csidx - 1].cratio) * csidx; tailsum2 = (srvdata[csidx].cratio - + (srvdata[ratio->srv_n - 1].cratio - ? : srvdata[ratio->srv_n - 1].oratio)) - * (ratio->srv_n - csidx); + + (srvdata[rpool->srv_n - 1].cratio + ? : srvdata[rpool->srv_n - 1].oratio)) + * (rpool->srv_n - csidx); return tailsum2 * schdata->riter > headsum2; } @@ -542,7 +727,7 @@ tfw_sched_ratio_is_srv_turn(TfwRatio *ratio, size_t csidx) * that it won't give any advantage. */ static TfwRatioSrvDesc * -tfw_sched_ratio_next_srv(TfwRatio *ratio) +tfw_sched_ratio_next_srv(TfwRatioPool *rpool, TfwRatio *ratio) { size_t csidx; TfwRatioSrvData *srvdata = ratio->srvdata; @@ -564,7 +749,7 @@ tfw_sched_ratio_next_srv(TfwRatio *ratio) */ if (schdata->reidx != csidx) { ++schdata->csidx; - if (schdata->csidx == ratio->srv_n) { + if (schdata->csidx == rpool->srv_n) { schdata->csidx = 0; schdata->riter = 1; } @@ -583,20 +768,20 @@ tfw_sched_ratio_next_srv(TfwRatio *ratio) * the group, then also start from the beginning, but do not * reset as it's been reset already (make sure of that). */ - if (likely(tfw_sched_ratio_is_srv_turn(ratio, csidx))) { + if (likely(tfw_sched_ratio_is_srv_turn(rpool, ratio, csidx))) { --srvdata[csidx].cratio; if (unlikely(!--schdata->crsum)) { schdata->csidx = 0; schdata->riter = 1; schdata->crsum = schdata->orsum; schdata->reidx = 0; - } else if (unlikely(++schdata->csidx == ratio->srv_n)) { - BUG_ON(schdata->reidx != ratio->srv_n); + } else if (unlikely(++schdata->csidx == rpool->srv_n)) { + BUG_ON(schdata->reidx != rpool->srv_n); schdata->csidx = 0; schdata->riter = 1; } spin_unlock(&schdata->lock); - return ratio->srvdesc + srvdata[csidx].sdidx; + return rpool->srvdesc + srvdata[csidx].sdidx; } /* * This is not the turn of the current server. Start @@ -609,6 +794,16 @@ tfw_sched_ratio_next_srv(TfwRatio *ratio) spin_unlock(&schdata->lock); } +/* + * Find an available connection to the server described by @srvdesc. + * Consider the following restrictions: + * 1. connection is not in recovery mode. + * 2. connection's queue is not be full. + * 3. connection doesn't have active non-idempotent requests. 
+ * + * The restriction #3 is controlled by @skipnip and can be removed + * to get a wider selection of available connections. + */ static inline TfwSrvConn * __sched_srv(TfwRatioSrvDesc *srvdesc, int skipnip, int *nipconn) { @@ -718,9 +913,9 @@ tfw_sched_ratio_sched_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) * under restrictions that are slightly relaxed. It's likely * that servers probed in these two passes are not the same. */ - attempts = ratio->srv_n * 2 + 1; + attempts = rpool->srv_n * 2 + 1; while (--attempts) { - srvdesc = tfw_sched_ratio_next_srv(ratio); + srvdesc = tfw_sched_ratio_next_srv(rpool, ratio); if ((srv_conn = __sched_srv(srvdesc, skipnip, &nipconn))) { rcu_read_unlock(); return srv_conn; @@ -743,24 +938,29 @@ static void tfw_sched_ratio_cleanup(TfwSrvGroup *sg) { size_t si; - TfwRatio *ratio; TfwRatioPool *rpool = sg->sched_data; if (!rpool) return; /* Free the data that is shared between pool entries. */ - ratio = rpool->rpool; for (si = 0; si < sg->srv_n; ++si) - if (ratio->srvdesc[si].conns) - kfree(ratio->srvdesc[si].conns); - kfree(ratio->srvdesc); + if (rpool->srvdesc[si].conns) + kfree(rpool->srvdesc[si].conns); + kfree(rpool->srvdesc); /* Free the data that is unique for each pool entry. */ - ratio = rpool->rpool; for (si = 0; si <= nr_cpu_ids; ++si) - if (ratio[si].srvdata) - kfree(ratio[si].srvdata); + if (rpool->rpool[si].srvdata) + kfree(rpool->rpool[si].srvdata); + + /* Free the data allocated for predictive scheduler. */ + if (rpool->hstdata) { + for (si = 0; si < sg->srv_n; ++si) + if (rpool->hstdata->past[si].hist) + kfree(rpool->hstdata->past[si].hist); + kfree(rpool->hstdata); + } kfree(rpool); sg->sched_data = NULL; @@ -774,7 +974,9 @@ tfw_sched_ratio_del_grp(TfwSrvGroup *sg) { TfwRatioPool *rpool = sg->sched_data; - if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) { + if (sg->flags & (TFW_SG_F_SCHED_RATIO_DYNAMIC + | TFW_SG_F_SCHED_RATIO_PREDICT)) + { atomic_set(&rpool->rearm, 0); smp_mb__after_atomic(); del_timer_sync(&rpool->timer); @@ -790,7 +992,7 @@ tfw_sched_ratio_del_grp(TfwSrvGroup *sg) * of connections for each server match the recorded values. */ static int -tfw_sched_hash_validate_grp(TfwSrvGroup *sg) +tfw_sched_ratio_validate_grp(TfwSrvGroup *sg) { size_t si = 0, ci; TfwServer *srv; @@ -815,6 +1017,9 @@ tfw_sched_hash_validate_grp(TfwSrvGroup *sg) * * At the time this function is called the server group is fully formed * and populated with all servers and connections. + * + * Additional configuration data required for Predictive scheduler are + * passed via @sg->sched_data. */ static int tfw_sched_ratio_add_grp(TfwSrvGroup *sg) @@ -825,9 +1030,12 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) TfwSrvConn *srv_conn; TfwRatio *ratio; TfwRatioPool *rpool; - TfwRatioSrvDesc *trsdesc, *srvdesc; + TfwRatioSrvDesc *srvdesc; + void *sched_data = sg->sched_data; + + sg->sched_data = NULL; - if (!tfw_sched_hash_validate_grp(sg)) + if (tfw_sched_ratio_validate_grp(sg)) return -EINVAL; /* Pool of TfwRatio{}. Initial place for Ratio Scheduler data. */ @@ -840,9 +1048,10 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) /* Array for server descriptors. Shared between RCU pool entries. */ size = sizeof(TfwRatioSrvDesc) * sg->srv_n; - if (!(trsdesc = kzalloc(size, GFP_KERNEL))) + if (!(rpool->srvdesc = kzalloc(size, GFP_KERNEL))) goto cleanup; - rpool->rpool[0].srvdesc = trsdesc; + rpool->psidx = sg->flags & TFW_SG_F_PSTATS_IDX_MASK; + rpool->srv_n = sg->srv_n; /* Set up each RCU pool entry with required arrays and data. 
*/ size = sizeof(TfwRatioSrvData) * sg->srv_n; @@ -850,14 +1059,11 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) if (!(ratio->srvdata = kzalloc(size, GFP_KERNEL))) goto cleanup; spin_lock_init(&ratio->schdata.lock); - ratio->srvdesc = trsdesc; - ratio->srv_n = sg->srv_n; - ratio->psidx = sg->flags & TFW_SG_F_PSTATS_IDX_MASK; atomic_set(&ratio->free, 1); } /* Initial setup of upstream server descriptors. */ - srvdesc = trsdesc; + srvdesc = rpool->srvdesc; list_for_each_entry(srv, &sg->srv_list, list) { size = sizeof(TfwSrvConn *) * srv->conn_n; if (!(srvdesc->conns = kzalloc(size, GFP_KERNEL))) @@ -872,21 +1078,58 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) ++srvdesc; } + /* Set up the necessary workspace for predictive scheduler. */ + if (sg->flags & TFW_SG_F_SCHED_RATIO_PREDICT) { + TfwRatioHstData *hstdata; + TfwSchrefPredict *schref = sched_data; + BUG_ON(!schref); + size = sizeof(TfwRatioHstData) + + sizeof(TfwRatioHstDesc) * sg->srv_n; + if (!(rpool->hstdata = kzalloc(size, GFP_KERNEL))) + goto cleanup; + hstdata = rpool->hstdata; + hstdata->past = (TfwRatioHstDesc *)(hstdata + 1); + hstdata->past_sz = schref->past * schref->rate; + hstdata->ahead = schref->ahead * schref->rate; + size = sizeof(TfwRatioHstXY) * hstdata->past_sz; + for (si = 0; si < sg->srv_n; ++si) { + TfwRatioHstDesc *hd = &hstdata->past[si]; + if (!(hd->hist = kzalloc(size, GFP_KERNEL))) + goto cleanup; + } + } + /* * Set up the initial ratio data. For dynamic ratios it's all * equal initial weights. */ if (!(sg->flags & (TFW_SG_F_SCHED_RATIO_STATIC - | TFW_SG_F_SCHED_RATIO_DYNAMIC))) - BUG(); + | TFW_SG_F_SCHED_RATIO_DYNAMIC + | TFW_SG_F_SCHED_RATIO_PREDICT))) + { + ret = -EINVAL; + goto cleanup; + } - tfw_sched_ratio_calc_static(rpool->ratio); + /* Calculate initial ratios for each server. */ + tfw_sched_ratio_calc_static(rpool, rpool->ratio); + /* Set up periodic re-calculation of ratios. */ if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) { + rpool->intvl = TFW_SCHED_RATIO_INTVL; atomic_set(&rpool->rearm, 1); + smp_mb__after_atomic(); + setup_timer(&rpool->timer, + tfw_sched_ratio_dynamic_tmfn, (unsigned long)sg); + mod_timer(&rpool->timer, jiffies + rpool->intvl); + } else if (sg->flags & TFW_SG_F_SCHED_RATIO_PREDICT) { + TfwSchrefPredict *schref = sched_data; + rpool->intvl = msecs_to_jiffies(1000 / schref->rate); + atomic_set(&rpool->rearm, 1); + smp_mb__after_atomic(); setup_timer(&rpool->timer, - tfw_sched_ratio_tmfn, (unsigned long)sg); - mod_timer(&rpool->timer, jiffies + TFW_SCHED_RATIO_INTVL); + tfw_sched_ratio_predict_tmfn, (unsigned long)sg); + mod_timer(&rpool->timer, jiffies + rpool->intvl); } return 0; diff --git a/tempesta_fw/server.c b/tempesta_fw/server.c index 3ae4a4608..3fdcd193f 100644 --- a/tempesta_fw/server.c +++ b/tempesta_fw/server.c @@ -117,7 +117,7 @@ tfw_sg_new(const char *name, gfp_t flags) TFW_DBG("new server group: '%s'\n", name); - sg = kmalloc(sizeof(*sg) + name_size, flags | __GFP_ZERO); + sg = kzalloc(sizeof(*sg) + name_size, flags); if (!sg) return NULL; diff --git a/tempesta_fw/server.h b/tempesta_fw/server.h index 5e487369e..325560799 100644 --- a/tempesta_fw/server.h +++ b/tempesta_fw/server.h @@ -53,8 +53,8 @@ typedef struct { TfwSrvGroup *sg; void *sched_data; void *apm; - int weight; - int conn_n; + unsigned int weight; + size_t conn_n; } TfwServer; /** @@ -92,7 +92,18 @@ struct tfw_srv_group_t { char name[0]; }; -/* Server related flags. 
+/** + * @past - period of time (secs) to keep past APM values; + * @rate - rate (times per sec) of retrieval of past APM values; + * @ahead - period of time (secs) for a prediction; + */ +typedef struct { + unsigned int past; + unsigned int rate; + unsigned int ahead; +} TfwSchrefPredict; + +/* Server and server group related flags. * Lower 4 bits keep an index into APM stats array. */ #define TFW_SG_F_PSTATS_IDX_MASK 0x000f diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c index a261e0779..aea60c32d 100644 --- a/tempesta_fw/sock_srv.c +++ b/tempesta_fw/sock_srv.c @@ -566,51 +566,28 @@ tfw_sock_srv_delete_all_conns(void) * Configuration handling * ------------------------------------------------------------------------ */ - -/* - * Default values for various configuration directives and options. - */ -#define TFW_CFG_SRV_CONNS_N_DEF 32 /* Default # of connections */ -#define TFW_CFG_SRV_QUEUE_SIZE_DEF 1000 /* Max queue size */ -#define TFW_CFG_SRV_FWD_TIMEOUT_DEF 60 /* Default request timeout */ -#define TFW_CFG_SRV_FWD_RETRIES_DEF 5 /* Default number of tries */ -#define TFW_CFG_SRV_CNS_RETRIES_DEF 10 /* Reconnect tries. */ -#define TFW_CFG_SRV_RETRY_NIP_DEF 0 /* Do NOT resend NIP reqs */ -#define TFW_CFG_SRV_STICKY_SESS_DEF 0 /* Don't use sticky sessions */ -#define TFW_CFG_SRV_WEIGHT_MIN 1 -#define TFW_CFG_SRV_WEIGHT_MAX 100 -#define TFW_CFG_SRV_WEIGHT_DEF 50 -#define TFW_CFG_SG_NAME_DEF "default" +#define TFW_CFG_DFLT_VAL "__dfltval__" /* Use a default value. */ static struct list_head tfw_cfg_in_slst = LIST_HEAD_INIT(tfw_cfg_in_slst); static struct list_head tfw_cfg_out_slst = LIST_HEAD_INIT(tfw_cfg_out_slst); static struct list_head *tfw_cfg_slst; static int tfw_cfg_slstsz, tfw_cfg_out_slstsz; static TfwScheduler *tfw_cfg_sched, *tfw_cfg_out_sched; +static TfwSchrefPredict tfw_cfg_schref_predict, tfw_cfg_out_schref_predict; +static void *tfw_cfg_schref, *tfw_cfg_out_schref; static TfwSrvGroup *tfw_cfg_sg, *tfw_cfg_out_sg; -static int tfw_cfg_queue_size = TFW_CFG_SRV_QUEUE_SIZE_DEF; -static int tfw_cfg_fwd_timeout = TFW_CFG_SRV_FWD_TIMEOUT_DEF; -static int tfw_cfg_fwd_retries = TFW_CFG_SRV_FWD_RETRIES_DEF; -static int tfw_cfg_cns_retries = TFW_CFG_SRV_CNS_RETRIES_DEF; -static unsigned int tfw_cfg_retry_nip = TFW_CFG_SRV_RETRY_NIP_DEF; -static unsigned int tfw_cfg_sticky_sess = TFW_CFG_SRV_STICKY_SESS_DEF; - -static int tfw_cfg_out_queue_size = TFW_CFG_SRV_QUEUE_SIZE_DEF; -static int tfw_cfg_out_fwd_timeout = TFW_CFG_SRV_FWD_TIMEOUT_DEF; -static int tfw_cfg_out_fwd_retries = TFW_CFG_SRV_FWD_RETRIES_DEF; -static int tfw_cfg_out_cns_retries = TFW_CFG_SRV_CNS_RETRIES_DEF; -static unsigned int tfw_cfg_out_retry_nip = TFW_CFG_SRV_RETRY_NIP_DEF; -static unsigned int tfw_cfg_out_sticky_sess = TFW_CFG_SRV_STICKY_SESS_DEF; - -static unsigned int tfw_cfg_sg_flags = TFW_SG_F_SCHED_RATIO_STATIC; -static unsigned int tfw_cfg_out_sg_flags = TFW_SG_F_SCHED_RATIO_STATIC; +static int tfw_cfg_queue_size, tfw_cfg_out_queue_size; +static int tfw_cfg_fwd_timeout, tfw_cfg_out_fwd_timeout; +static int tfw_cfg_fwd_retries, tfw_cfg_out_fwd_retries; +static int tfw_cfg_cns_retries, tfw_cfg_out_cns_retries; +static unsigned int tfw_cfg_retry_nip, tfw_cfg_out_retry_nip; +static unsigned int tfw_cfg_sticky_sess, tfw_cfg_out_sticky_sess; +static unsigned int tfw_cfg_sg_flags, tfw_cfg_out_sg_flags; static int tfw_cfgop_intval(TfwCfgSpec *cs, TfwCfgEntry *ce, int *intval) { - int ret; - if (ce->val_n != 1) { TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); return -EINVAL; @@ -619,10 +596,9 @@ 
tfw_cfgop_intval(TfwCfgSpec *cs, TfwCfgEntry *ce, int *intval) TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); return -EINVAL; } - if ((ret = tfw_cfg_parse_int(ce->vals[0], intval))) - return ret; - return 0; + cs->dest = intval; + return tfw_cfg_set_int(cs, ce); } static int @@ -664,12 +640,19 @@ tfw_cfgop_out_fwd_retries(TfwCfgSpec *cs, TfwCfgEntry *ce) static inline int tfw_cfgop_retry_nip(TfwCfgSpec *cs, TfwCfgEntry *ce, int *retry_nip) { - if (ce->attr_n || ce->val_n) { - TFW_ERR_NL("The option may not have arguments.\n"); + if (ce->attr_n) { + TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); + return -EINVAL; + } + if (!ce->val_n) { + *retry_nip = TFW_SRV_RETRY_NIP; + } else if (!strcasecmp(ce->vals[0], TFW_CFG_DFLT_VAL)) { + BUG_ON(ce->val_n != 1); + *retry_nip = 0; + } else { + TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); return -EINVAL; } - - *retry_nip = 1; return 0; } @@ -685,18 +668,17 @@ tfw_cfgop_sticky_sess(TfwCfgSpec *cs, TfwCfgEntry *ce, unsigned int *use_sticky) TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); return -EINVAL; } - - if (ce->val_n) { - if (!strcasecmp(ce->vals[0], "allow_failover")) { - *use_sticky |= TFW_SRV_STICKY_FAILOVER; - } else { - TFW_ERR_NL("Unsupported argument: %s\n", ce->vals[0]); - return -EINVAL; - } + if (!ce->val_n) { + *use_sticky = TFW_SRV_STICKY; + } else if (!strcasecmp(ce->vals[0], "allow_failover")) { + *use_sticky = TFW_SRV_STICKY | TFW_SRV_STICKY_FAILOVER; + } else if (!strcasecmp(ce->vals[0], TFW_CFG_DFLT_VAL)) { + *use_sticky = 0; + } else { + TFW_ERR_NL("Unsupported argument: %s\n", ce->vals[0]); + return -EINVAL; } - *use_sticky |= TFW_SRV_STICKY; - return 0; } @@ -750,7 +732,13 @@ tfw_cfgop_set_conn_retries(TfwSrvGroup *sg, int recns) return 0; } -/* +/* Default and maximum values for "server" options. */ +#define TFW_CFG_SRV_CONNS_N_DEF 32 /* Default # of connections */ +#define TFW_CFG_SRV_WEIGHT_MIN 1 /* Min static weight value */ +#define TFW_CFG_SRV_WEIGHT_MAX 100 /* Max static weight value */ +#define TFW_CFG_SRV_WEIGHT_DEF 50 /* Dflt static weight value */ + +/** * Common code to handle 'server' directive. */ static int @@ -812,7 +800,9 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, struct list_head *slst) return -EINVAL; } /* Default weight is set only for static ratio scheduler. */ - if (has_weight && ((weight < 1) || (weight > 100))) { + if (has_weight && ((weight < TFW_CFG_SRV_WEIGHT_MIN) + || (weight > TFW_CFG_SRV_WEIGHT_MAX))) + { TFW_ERR_NL("Out of range of [%d..%d]: 'weight=%d'\n", TFW_CFG_SRV_WEIGHT_MIN, TFW_CFG_SRV_WEIGHT_MAX, weight); @@ -916,6 +906,7 @@ tfw_cfgop_begin_srv_group(TfwCfgSpec *cs, TfwCfgEntry *ce) tfw_cfg_sticky_sess = tfw_cfg_out_sticky_sess; tfw_cfg_sg_flags = tfw_cfg_out_sg_flags; tfw_cfg_sched = tfw_cfg_out_sched; + tfw_cfg_schref = tfw_cfg_out_schref; BUG_ON(!list_empty(&tfw_cfg_in_slst)); tfw_cfg_slst = &tfw_cfg_in_slst; @@ -975,8 +966,8 @@ tfw_cfgop_setup_srv_group(void) tfw_cfg_sg->max_refwd = tfw_cfg_fwd_retries ? : UINT_MAX; tfw_cfg_sg->flags = tfw_cfg_sg_flags; - tfw_cfg_sg->flags |= tfw_cfg_retry_nip ? TFW_SRV_RETRY_NIP : 0; - tfw_cfg_sg->flags |= tfw_cfg_sticky_sess; + tfw_cfg_sg->flags |= tfw_cfg_retry_nip | tfw_cfg_sticky_sess; + tfw_cfg_sg->sched_data = tfw_cfg_schref; /* * Check 'ratio' scheduler configuration for incompatibilities. 
@@ -1031,63 +1022,166 @@ tfw_cfgop_finish_srv_group(TfwCfgSpec *cs) } static int -tfw_cfg_handle_ratio(TfwCfgSpec *cs, TfwCfgEntry *ce, unsigned int *sg_flags) +tfw_cfg_handle_ratio_predyn_opts(TfwCfgEntry *ce, unsigned int *arg_flags) { - unsigned int idx, flags, value; + unsigned int idx, value, flags = *arg_flags; - if (ce->val_n < 2) { - /* Default ratio scheduler type. */ - flags = TFW_SG_F_SCHED_RATIO_STATIC; - } else if (!strcasecmp(ce->vals[1], "static")) { - flags = TFW_SG_F_SCHED_RATIO_STATIC; - } else if (!strcasecmp(ce->vals[1], "dynamic")) { - flags = TFW_SG_F_SCHED_RATIO_DYNAMIC; - if (ce->val_n < 3) { - /* Default dynamic type. */ - flags |= TFW_PSTATS_IDX_AVG; + if (ce->val_n < 3) { + /* Default dynamic type. */ + flags |= TFW_PSTATS_IDX_AVG; + goto done; + } + if (!strcasecmp(ce->vals[2], "minimum")) { + idx = TFW_PSTATS_IDX_MIN; + }else if (!strcasecmp(ce->vals[2], "maximum")) { + idx = TFW_PSTATS_IDX_MAX; + } else if (!strcasecmp(ce->vals[2], "average")) { + idx = TFW_PSTATS_IDX_AVG; + } else if (!strcasecmp(ce->vals[2], "percentile")) { + if (ce->val_n < 4) { + /* Default percentile. */ + flags |= TFW_PSTATS_IDX_P90; goto done; } - if (!strcasecmp(ce->vals[2], "minimum")) { - idx = TFW_PSTATS_IDX_MIN; - }else if (!strcasecmp(ce->vals[2], "maximum")) { - idx = TFW_PSTATS_IDX_MAX; - } else if (!strcasecmp(ce->vals[2], "average")) { - idx = TFW_PSTATS_IDX_AVG; - } else if (!strcasecmp(ce->vals[2], "percentile")) { - if (ce->val_n < 4) { - /* Default percentile. */ - flags |= TFW_PSTATS_IDX_P90; - goto done; + if (tfw_cfg_parse_int(ce->vals[3], &value)) { + TFW_ERR_NL("Invalid value: '%s'\n", ce->vals[3]); + return -EINVAL; + } + for (idx = 0; idx < ARRAY_SIZE(tfw_pstats_ith); ++idx) { + if (!tfw_pstats_ith[idx]) + continue; + if (tfw_pstats_ith[idx] == value) + break; + } + if (idx == ARRAY_SIZE(tfw_pstats_ith)) { + TFW_ERR_NL("Invalid value: '%s'\n", ce->vals[3]); + return -EINVAL; + } + } else { + TFW_ERR_NL("Unsupported argument: '%s'\n", ce->vals[2]); + return -EINVAL; + } + flags |= idx; + +done: + *arg_flags = flags; + return 0; +} + +/* Default and maximum values for "sched ratio predict" options. 
*/ +#define TFW_CFG_PAST_DEF 30 /* 30 secs of past APM vals */ +#define TFW_CFG_PAST_MAX 120 /* 120 secs of past APM vals */ +#define TFW_CFG_RATE_DEF 20 /* 20 times/sec */ +#define TFW_CFG_RATE_MAX 20 /* 20 times/sec */ + +static int +tfw_cfg_handle_ratio_predict(TfwCfgEntry *ce, + void *arg_schref, unsigned int *arg_flags) +{ + int i, ret; + const char *key, *val; + bool has_past = false, has_rate = false, has_ahead = false; + TfwSchrefPredict schref = { 0 }; + + if ((ret = tfw_cfg_handle_ratio_predyn_opts(ce, arg_flags))) + return ret; + + TFW_CFG_ENTRY_FOR_EACH_ATTR(ce, i, key, val) { + if (!strcasecmp(key, "past")) { + if (has_past) { + TFW_ERR_NL("Duplicate argument: '%s'\n", key); + return -EINVAL; } - if (tfw_cfg_parse_int(ce->vals[3], &value)) { - TFW_ERR_NL("Invalid value: '%s'\n", - ce->vals[3]); + if (tfw_cfg_parse_int(val, &schref.past)) { + TFW_ERR_NL("Invalid value: '%s'\n", val); return -EINVAL; } - for (idx = 0; idx < ARRAY_SIZE(tfw_pstats_ith); ++idx) { - if (!tfw_pstats_ith[idx]) - continue; - if (tfw_pstats_ith[idx] == value) - break; + has_past = true; + } else if (!strcasecmp(key, "rate")) { + if (has_rate) { + TFW_ERR_NL("Duplicate argument: '%s'\n", key); + return -EINVAL; } - if (idx == ARRAY_SIZE(tfw_pstats_ith)) { - TFW_ERR_NL("Invalid value: '%s'\n", - ce->vals[3]); + if (tfw_cfg_parse_int(val, &schref.rate)) { + TFW_ERR_NL("Invalid value: '%s'\n", val); return -EINVAL; } - } else { - TFW_ERR_NL("Unsupported argument: '%s'\n", ce->vals[2]); - return -EINVAL; + has_rate = true; + } else if (!strcasecmp(key, "ahead")) { + if (has_ahead) { + TFW_ERR_NL("Duplicate argument: '%s'\n", key); + return -EINVAL; + } + if (tfw_cfg_parse_int(val, &schref.ahead)) { + TFW_ERR_NL("Invalid value: '%s'\n", val); + return -EINVAL; + } + has_ahead = true; } - flags |= idx; + } + if (!has_past) { + schref.past = TFW_CFG_PAST_DEF; + } else if ((schref.past < 1) || (schref.past > TFW_CFG_PAST_MAX)) { + TFW_ERR_NL("Out of range of [1..%d]: 'past=%d'\n", + TFW_CFG_PAST_MAX, schref.past); + return -EINVAL; + } + if (!has_rate) { + schref.rate = TFW_CFG_RATE_DEF; + } else if ((schref.rate < 1) || (schref.rate > TFW_CFG_RATE_MAX)) { + TFW_ERR_NL("Out of range of [1..%d]: 'rate=%d'\n", + TFW_CFG_RATE_MAX, schref.rate); + return -EINVAL; + } + if (!has_ahead) { + schref.ahead = schref.past > 1 ? schref.past / 2 : 1; + } else if ((schref.ahead < 1) || (schref.ahead > schref.past / 2)) { + TFW_ERR_NL("Out of range of [1..%d]: 'ahead=%d'." + "Can't be greater than half of 'past=%d'.\n", + schref.past / 2, schref.ahead, schref.past); + return -EINVAL; + } + + *(TfwSchrefPredict *)arg_schref = schref; + return 0; +} + +static int +tfw_cfg_handle_ratio_dynamic(TfwCfgEntry *ce, unsigned int *arg_flags) +{ + if (ce->attr_n) { + TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); + return -EINVAL; + } + + return tfw_cfg_handle_ratio_predyn_opts(ce, arg_flags); +} + +static int +tfw_cfg_handle_ratio(TfwCfgEntry *ce, void *schref, unsigned int *sg_flags) +{ + int ret; + unsigned int flags; + + if (ce->val_n < 2) { + /* Default ratio scheduler type. 
*/ + flags = TFW_SG_F_SCHED_RATIO_STATIC; + } else if (!strcasecmp(ce->vals[1], "static")) { + flags = TFW_SG_F_SCHED_RATIO_STATIC; + } else if (!strcasecmp(ce->vals[1], "dynamic")) { + flags = TFW_SG_F_SCHED_RATIO_DYNAMIC; + if ((ret = tfw_cfg_handle_ratio_dynamic(ce, &flags))) + return ret; + } else if (!strcasecmp(ce->vals[1], "predict")) { + flags = TFW_SG_F_SCHED_RATIO_PREDICT; + if ((ret = tfw_cfg_handle_ratio_predict(ce, schref, &flags))) + return ret; } else { TFW_ERR_NL("Unsupported argument: '%s'\n", ce->vals[1]); return -EINVAL; } -done: *sg_flags = flags; - return 0; } @@ -1095,8 +1189,8 @@ tfw_cfg_handle_ratio(TfwCfgSpec *cs, TfwCfgEntry *ce, unsigned int *sg_flags) * Common code to handle 'sched' directive. */ static int -tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, - TfwScheduler **arg_sched, unsigned int *sg_flags) +tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, TfwScheduler **arg_sched, + void *schref, unsigned int *sg_flags) { TfwScheduler *sched; @@ -1104,10 +1198,6 @@ tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); return -EINVAL; } - if (ce->attr_n) { - TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); - return -EINVAL; - } if (!(sched = tfw_sched_lookup(ce->vals[0]))) { TFW_ERR_NL("Unrecognized scheduler: '%s'\n", ce->vals[0]); @@ -1115,7 +1205,7 @@ tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, } if (!strcasecmp(sched->name, "ratio")) - if (tfw_cfg_handle_ratio(cs, ce, sg_flags)) + if (tfw_cfg_handle_ratio(ce, schref, sg_flags)) return -EINVAL; *arg_sched = sched; @@ -1126,14 +1216,20 @@ tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, static int tfw_cfgop_in_sched(TfwCfgSpec *cs, TfwCfgEntry *ce) { + tfw_cfg_schref = &tfw_cfg_schref_predict; + return tfw_cfgop_sched(cs, ce, &tfw_cfg_sched, + tfw_cfg_schref, &tfw_cfg_sg_flags); } static int tfw_cfgop_out_sched(TfwCfgSpec *cs, TfwCfgEntry *ce) { + tfw_cfg_out_schref = &tfw_cfg_out_schref_predict; + return tfw_cfgop_sched(cs, ce, &tfw_cfg_out_sched, + tfw_cfg_out_schref, &tfw_cfg_out_sg_flags); } @@ -1158,6 +1254,7 @@ tfw_clean_srv_groups(TfwCfgSpec *cs) tfw_cfg_sg = tfw_cfg_out_sg = NULL; tfw_cfg_sched = tfw_cfg_out_sched = NULL; + tfw_cfg_schref = tfw_cfg_out_schref = NULL; tfw_cfg_slstsz = tfw_cfg_out_slstsz = 0; tfw_sock_srv_delete_all_conns(); @@ -1174,7 +1271,7 @@ tfw_sock_srv_start(void) * a server outside of any group is found in the configuration. 
*/ if (tfw_cfg_out_slstsz) { - tfw_cfg_out_sg = tfw_sg_new(TFW_CFG_SG_NAME_DEF, GFP_KERNEL); + tfw_cfg_out_sg = tfw_sg_new("default", GFP_KERNEL); if (!tfw_cfg_out_sg) { TFW_ERR_NL("Unable to add default server group\n"); return -EINVAL; @@ -1184,12 +1281,13 @@ tfw_sock_srv_start(void) tfw_cfg_queue_size = tfw_cfg_out_queue_size; tfw_cfg_fwd_timeout = tfw_cfg_out_fwd_timeout; tfw_cfg_fwd_retries = tfw_cfg_out_fwd_retries; - tfw_cfg_retry_nip = tfw_cfg_out_retry_nip; tfw_cfg_sticky_sess = tfw_cfg_out_sticky_sess; + tfw_cfg_retry_nip = tfw_cfg_out_retry_nip; tfw_cfg_sg_flags = tfw_cfg_out_sg_flags; tfw_cfg_slst = &tfw_cfg_out_slst; tfw_cfg_slstsz = tfw_cfg_out_slstsz; tfw_cfg_sched = tfw_cfg_out_sched; + tfw_cfg_schref = tfw_cfg_out_schref; tfw_cfg_sg = tfw_cfg_out_sg; if ((ret = tfw_cfgop_setup_srv_group())) @@ -1220,49 +1318,61 @@ static TfwCfgSpec tfw_srv_group_specs[] = { .cleanup = tfw_clean_srv_groups }, { - "sched", "ratio", + "sched", "ratio static", tfw_cfgop_in_sched, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { - "server_queue_size", NULL, + "server_queue_size", "1000", tfw_cfgop_in_queue_size, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_forward_timeout", NULL, + "server_forward_timeout", "60", tfw_cfgop_in_fwd_timeout, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_forward_retries", NULL, + "server_forward_retries", "5", tfw_cfgop_in_fwd_retries, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_retry_nonidempotent", NULL, + "server_retry_nonidempotent", TFW_CFG_DFLT_VAL, tfw_cfgop_in_retry_nip, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { - "server_connect_retries", NULL, + "server_connect_retries", "10", tfw_cfgop_in_conn_retries, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "sticky_sessions", NULL, + "sticky_sessions", TFW_CFG_DFLT_VAL, tfw_cfgop_in_sticky_sess, .allow_none = true, .allow_repeat = false, @@ -1284,49 +1394,61 @@ TfwCfgMod tfw_sock_srv_cfg_mod = { .cleanup = tfw_clean_srv_groups, }, { - "sched", "ratio", + "sched", "ratio static", tfw_cfgop_out_sched, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { - "server_queue_size", NULL, + "server_queue_size", "1000", tfw_cfgop_out_queue_size, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_forward_timeout", NULL, + "server_forward_timeout", "60", tfw_cfgop_out_fwd_timeout, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_forward_retries", NULL, + "server_forward_retries", "5", tfw_cfgop_out_fwd_retries, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_retry_non_idempotent", NULL, + "server_retry_non_idempotent", TFW_CFG_DFLT_VAL, tfw_cfgop_out_retry_nip, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { - "server_connect_retries", NULL, + 
"server_connect_retries", "10", tfw_cfgop_out_conn_retries, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "sticky_sessions", NULL, + "sticky_sessions", TFW_CFG_DFLT_VAL, tfw_cfgop_out_sticky_sess, .allow_none = true, .allow_repeat = false, From 8c4e508ee0f489140b716a5b885abad0e23fc5f3 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Thu, 20 Apr 2017 00:11:29 +0300 Subject: [PATCH 20/37] Wait for RCU callbacks to complete before releasing memory. --- tempesta_fw/sched/tfw_sched_ratio.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 64cb577df..275405310 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -968,12 +968,21 @@ tfw_sched_ratio_cleanup(TfwSrvGroup *sg) /** * Delete a server group from Ratio Scheduler. + * + * Note that at this time the group is inactive. That means there are no + * attempts to schedule to servers in this group and enter RCU read-side + * critical section. There's no need for synchronize_rcu() to wait for + * expiration of an RCU grace period. */ static void tfw_sched_ratio_del_grp(TfwSrvGroup *sg) { TfwRatioPool *rpool = sg->sched_data; + /* + * Make sure the timer doesn't re-arms itself. This + * also ensures that no more RCU callbacks are created. + */ if (sg->flags & (TFW_SG_F_SCHED_RATIO_DYNAMIC | TFW_SG_F_SCHED_RATIO_PREDICT)) { @@ -981,7 +990,11 @@ tfw_sched_ratio_del_grp(TfwSrvGroup *sg) smp_mb__after_atomic(); del_timer_sync(&rpool->timer); } - synchronize_rcu(); + + /* Wait for outstanding RCU callbacks to complete. */ + rcu_barrier(); + + /* Release all memory allocated for the group. */ tfw_sched_ratio_cleanup(sg); } From 7d167e612ba272ab227f8c8937ab5b786a490d2b Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Mon, 24 Apr 2017 01:00:20 +0300 Subject: [PATCH 21/37] Remove a tiny piece of dead code. --- tempesta_fw/sched/tfw_sched_ratio.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 275405310..7bc798b52 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -790,8 +790,6 @@ tfw_sched_ratio_next_srv(TfwRatioPool *rpool, TfwRatio *ratio) schdata->csidx = 0; ++schdata->riter; goto retry; - - spin_unlock(&schdata->lock); } /* From 3eaab0fb9da2233af733675276f750be77716975 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Mon, 24 Apr 2017 16:22:00 +0300 Subject: [PATCH 22/37] Count the remainder from division by two (even/odd numbers). 
--- tempesta_fw/apm.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index 600834806..c805155b9 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -138,7 +138,7 @@ static void __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) { int i; - unsigned long tmp; + unsigned long cnt_full, cnt_half; --pc->order; pc->begin = pc->end - ((TFW_STATS_BCKTS - 1) << pc->order); @@ -153,14 +153,15 @@ __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) */ for (i = 1; i < TFW_STATS_BCKTS / 2; ++i) atomic_add(atomic_read(&rng->cnt[r][i]), &rng->cnt[r][0]); - tmp = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2]) / 2; - atomic_add(tmp, &rng->cnt[r][0]); - atomic_set(&rng->cnt[r][1], tmp); + cnt_full = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2]); + cnt_half = cnt_full / 2; + atomic_add(cnt_half, &rng->cnt[r][0]); + atomic_set(&rng->cnt[r][1], cnt_full - cnt_half); for (i = 1; i < TFW_STATS_BCKTS / 2; ++i) { - tmp = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2 + i]); - tmp /= 2; - atomic_set(&rng->cnt[r][i * 2], tmp); - atomic_set(&rng->cnt[r][i * 2 + 1], tmp); + cnt_full = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2 + i]); + cnt_half = cnt_full / 2; + atomic_set(&rng->cnt[r][i * 2], cnt_half); + atomic_set(&rng->cnt[r][i * 2 + 1], cnt_full - cnt_half); } } From 50d3a833c3d20536ec22d7db6704d97d8def079c Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Thu, 27 Apr 2017 20:15:39 +0300 Subject: [PATCH 23/37] Fix the initial value of RBCTL's entry_cnt in APM. --- tempesta_fw/apm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index c805155b9..d400e4765 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -711,8 +711,9 @@ tfw_apm_rbctl_update(TfwApmData *data, int recalc) for (i = 0; i < rbuf->rbufsz; ++i) total_cnt += atomic64_read(&rbent[i].pcntrng.tot_cnt); + entry_cnt = atomic64_read(&rbent[centry].pcntrng.tot_cnt); - rbctl->entry_cnt = 0; + rbctl->entry_cnt = entry_cnt; rbctl->total_cnt = total_cnt; rbctl->jtmwstamp = jtmwstart; From 12280c9950269a5585ad202f0b4bfc5d73afd4e2 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Fri, 28 Apr 2017 02:43:13 +0300 Subject: [PATCH 24/37] Fix a WARNING() spewed by rcu_process_callbacks(). --- tempesta_fw/sched/tfw_sched_ratio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 7bc798b52..559d897e8 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -1055,7 +1055,6 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) return -ENOMEM; rpool = sg->sched_data; rpool->rpool = sg->sched_data + sizeof(TfwRatioPool); - rpool->ratio = rpool->rpool; /* Array for server descriptors. Shared between RCU pool entries. */ size = sizeof(TfwRatioSrvDesc) * sg->srv_n; @@ -1123,6 +1122,7 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) } /* Calculate initial ratios for each server. */ + rcu_assign_pointer(rpool->ratio, tfw_sched_ratio_rpool_get(rpool)); tfw_sched_ratio_calc_static(rpool, rpool->ratio); /* Set up periodic re-calculation of ratios. */ From 3d75192398933cbc9012dcec9edfc19473b0851e Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Fri, 28 Apr 2017 14:00:39 +0300 Subject: [PATCH 25/37] Serialize APM stats data updates and APM stats values calculation. The serialization is implemented via lockless work queues. Stats data for each server is put on a work queue (one work queue per CPU). 
Work queues are processed periodically, and the stats data for each
server is updated in a serialized manner. After that, new stats values
are calculated for each server with the updated stats data.

This provides a number of important benefits. Among those are:
- Stats data for each server is now consistent at all times. There are
  no concurrent updates while new stats values are calculated.
- Both stats updates and stats values calculations are now totally
  lockless.

Note that APM data is now decoupled from the server structure. It's
attached to or detached from a server's instance, but it's managed
completely by the APM module with the help of a reference counter.
Thus it may have a different life cycle than a server's instance.

It's possible that a different solution may be needed to process work
queues in a timely manner in high-load situations. Periodic processing
on a timer may be insufficient to handle the workload.
---
 tempesta_fw/apm.c                   | 327 +++++++++++++++++++---------
 tempesta_fw/apm.h                   |  11 +-
 tempesta_fw/http.c                  |   2 +-
 tempesta_fw/procfs.c                |   2 +-
 tempesta_fw/sched/tfw_sched_ratio.c |   2 +-
 tempesta_fw/server.c                |  12 +-
 tempesta_fw/server.h                |   5 +-
 tempesta_fw/sock_srv.c              |   2 +-
 8 files changed, 242 insertions(+), 121 deletions(-)

diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c
index d400e4765..373fca9cc 100644
--- a/tempesta_fw/apm.c
+++ b/tempesta_fw/apm.c
@@ -29,6 +29,7 @@
 #include "log.h"
 #include "pool.h"
 #include "procfs.h"
+#include "work_queue.h"
 
 /*
  * The algorithm is constructed to be as efficient as possible. That's
@@ -204,7 +205,7 @@ tfw_stats_extend(TfwPcntRanges *rng, unsigned int r_time)
  * largest response time faced.
  */
 static void
-__tfw_stats_adjust(TfwPcntRanges *rng, int r)
+tfw_stats_adjust(TfwPcntRanges *rng, int r)
 {
 	TfwPcntCtl pc;
 	unsigned long i, cnt = 0, sum = 0, max = 0, i_max = 0;
@@ -264,23 +265,6 @@ __tfw_stats_adjust(TfwPcntRanges *rng, int r)
 	}
 }
 
-/*
- * See if the range @r contains large outliers. Adjust it if so.
- * This is the locked version.
- *
- * If the lock is busy then either the ranges are being adjusted
- * or the percentiles are being calculated at this very moment.
- * Just skip the adjustment of ranges and do it next time.
- */
-static inline void
-tfw_stats_adjust(TfwPcntRanges *rng, int r, spinlock_t *slock)
-{
-	if (!spin_trylock(slock))
-		return;
-	__tfw_stats_adjust(rng, r);
-	spin_unlock(slock);
-}
-
 /*
  * Set the new maximum value.
  * Return true if the new value has been set.
@@ -331,7 +315,7 @@ tfw_stats_adj_min(TfwPcntRanges *rng, unsigned int r_time)
  * We only care about correct array indexing.
*/ static void -tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time, spinlock_t *slock) +tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) { TfwPcntCtl pc3, pc2 = { .atomic = rng->ctl[2].atomic }; @@ -346,7 +330,7 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time, spinlock_t *slock) TfwPcntCtl pc0, pc1 = { .atomic = rng->ctl[1].atomic }; if (pc1.end < r_time) { atomic_inc(__rng(&pc2, rng->cnt[2], r_time)); - tfw_stats_adjust(rng, 2, slock); + tfw_stats_adjust(rng, 2); atomic64_inc(&rng->tot_cnt); return; } @@ -355,27 +339,24 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time, spinlock_t *slock) BUG_ON(pc0.begin != 1); /* left bound is never moved */ if (pc0.end < r_time) { atomic_inc(__rng(&pc1, rng->cnt[1], r_time)); - tfw_stats_adjust(rng, 1, slock); + tfw_stats_adjust(rng, 1); atomic64_inc(&rng->tot_cnt); return; } atomic_inc(__rng(&pc0, rng->cnt[0], r_time)); - tfw_stats_adjust(rng, 0, slock); + tfw_stats_adjust(rng, 0); atomic64_inc(&rng->tot_cnt); return; } - if (!spin_trylock(slock)) - return; pc3.atomic = rng->ctl[3].atomic; if (unlikely(r_time > pc3.end)) { tfw_stats_extend(rng, r_time); pc3.atomic = rng->ctl[3].atomic; } atomic_inc(__rng(&pc3, rng->cnt[3], r_time)); - __tfw_stats_adjust(rng, 3); + tfw_stats_adjust(rng, 3); atomic64_inc(&rng->tot_cnt); - spin_unlock(slock); } /* @@ -388,6 +369,7 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time, spinlock_t *slock) */ /* * A ring buffer entry structure. + * * @pcntrng - Struct for response time data by the percentiles algorithm. * @jtmistamp - The start of the time interval for the current entry. * @reset - The entry can be reset by one thread at a time. @@ -400,8 +382,10 @@ typedef struct { /* * The ring buffer contol structure. + * * This is a supporting structure. It keeps related data that is useful * in making decisions on the need of recalculation of percentiles. + * * @jtmwstamp - The start of the time window the percentiles are for. * @entry_cnt - The number of hits in the current buffer ring entry. * @total_cnt - The number of hits within the current time window. @@ -415,6 +399,7 @@ typedef struct { /* * The ring buffer structure. + * * @rbent - Array of ring buffer entries. * @slock - The lock to adjust the ranges in the current entry. * @rbufsz - The size of @rbent. @@ -428,6 +413,7 @@ typedef struct { /* * The stats entry data structure. * Keeps the latest values of calculated percentiles. + * * @pstats - The percentile stats structure. * @rwlock - Protect updates. */ @@ -438,6 +424,7 @@ typedef struct { /* * The stats data structure. + * * There's only one updater that runs on timer. It calculates the latest * percentiles and updates the stored values. There are multiple readers * of the stored values. The stored values of the latest percentiles are @@ -446,6 +433,7 @@ typedef struct { * at @asent[@rdidx % 2]. The writer writes the new percentile values to * @asent[(@rdidx + 1) % 2], and then increments @rdidx. The reading and * the writing are protected by a rwlock. + * * @asent - The stats entries for reading/writing (flip-flop manner). * @rdidx - The current index in @asent for readers. */ @@ -456,27 +444,33 @@ typedef struct { /* * APM Data structure. + * * Note that the organization of the supporting data heavily depends * on the fact that there's only one party that does the calculation * of percentiles - the function that runs periodically on timer. * If there are several different parties that do the calculation, * then the data may need to be organized differently. 
+ * + * @list - Member in @tfw_apm_qcalc or @tfw_apm_qrecalc. * @rbuf - The ring buffer for the specified time window. * @rbctl - The control data helpful in taking optimizations. * @stats - The latest percentiles. * @timer - The periodic timer handle. * @flags - The atomic flags (see below). + * @refcnt - The reference count. */ -#define TFW_APM_DATA_F_REARM (0x0001) /* Re-arm the timer. */ -#define TFW_APM_DATA_F_RECALC (0x0002) /* Need to recalculate. */ +#define TFW_APM_DATA_F_RECALC (0x0001) /* Need to recalculate. */ +#define TFW_APM_DATA_F_UPDONE (0x0002) /* RTT update done. */ #define TFW_APM_TIMER_TIMEOUT (HZ/20) /* The timer periodicity. */ typedef struct { + struct list_head list; TfwApmRBuf rbuf; TfwApmRBCtl rbctl; TfwApmStats stats; struct timer_list timer; unsigned long flags; + atomic_t refcnt; } TfwApmData; /* @@ -494,6 +488,30 @@ static int tfw_apm_jtmwindow; /* Time window in jiffies. */ static int tfw_apm_jtmintrvl; /* Time interval in jiffies. */ static int tfw_apm_tmwscale; /* Time window scale. */ +/* Work Queue item for stats data. */ +typedef struct { + TfwApmData *data; + unsigned long jtstamp; + unsigned long rtt; + unsigned long __pad; +} TfwApmWqItem; + +/* A Work Queue on each CPU. */ +static DEFINE_PER_CPU(TfwRBQueue, tfw_apm_wq); + +/* + * @tfw_apm_qcalc - List of servers that require stats calculation. + * @tfw_apm_qrecalc - List of servers that require stats re-calculation. + * @tfw_apm_rearm - Atomic flag, tells if the timer needs re-arming. + * @tfw_apm_timer - The periodic timer handle. + */ +#define TFW_APM_DATA_F_REARM (0x0001) /* Re-arm the timer. */ + +static struct list_head tfw_apm_qcalc; +static struct list_head tfw_apm_qrecalc; +static unsigned long tfw_apm_rearm; +static struct timer_list tfw_apm_timer; + /* * Get the next bucket in the ring buffer entry that has a non-zero * hits count. Set the bucket's sequential number, the range number, @@ -501,6 +519,7 @@ static int tfw_apm_tmwscale; /* Time window scale. */ */ /* * Ring buffer entry state structure. + * * @v - The response time value. * @i - The current sequential bucket number across all ranges. * @r - The current range number. @@ -537,7 +556,8 @@ __tfw_apm_state_next(TfwPcntRanges *rng, TfwApmRBEState *st) return; } } - __tfw_apm_state_set(st, USHRT_MAX, i, r, b); + __tfw_apm_state_set(st, USHRT_MAX, TFW_STATS_RANGES * TFW_STATS_BCKTS, + TFW_STATS_RANGES, TFW_STATS_BCKTS); } static inline void @@ -754,34 +774,11 @@ tfw_apm_rbctl_update(TfwApmData *data, int recalc) /* * Calculate the latest percentiles if necessary. * - * Return the number of percentile values that have been filled - * if potentially new percentile values were calculated. - * Return 0 if the percentile values didn't need the recalculation. - * REturn -1 of the recalculation could not be performed. + * Return 0 if the calculation is successful. + * Return < 0 if there was a system error. + * Return > 0 and < @prcntlsz if the calculation is incomplete. */ static int -__tfw_apm_calc(TfwApmData *data, TfwPrcntlStats *pstats, int recalc) -{ - int ret; - - if (!spin_trylock(&data->rbuf.slock)) - return -1; - if ((ret = tfw_apm_rbctl_update(data, recalc))) - ret = tfw_apm_prnctl_calc(&data->rbuf, &data->rbctl, pstats); - spin_unlock(&data->rbuf.slock); - - return ret; -} - -/* - * Calculate the latest percentiles if necessary. - * - * Note that this function may also be used concurrently by other users - * than the kernel timer function in this module, should the need arise. 
- * That should only be done in exceptional cases (like testing), because - * it would increase @data->rbuf->slock lock contention. - */ -static void tfw_apm_calc(TfwApmData *data) { int nfilled, recalc; @@ -798,9 +795,11 @@ tfw_apm_calc(TfwApmData *data) asent = &data->stats.asent[(rdidx + 1) % 2]; recalc = test_and_clear_bit(TFW_APM_DATA_F_RECALC, &data->flags); - nfilled = __tfw_apm_calc(data, &pstats, recalc); + if (!tfw_apm_rbctl_update(data, recalc)) + return 0; + nfilled = tfw_apm_prnctl_calc(&data->rbuf, &data->rbctl, &pstats); if (!nfilled) - return; + return 0; if (nfilled < asent->pstats.psz) { TFW_DBG3("%s: Percentile calculation incomplete.\n", __func__); @@ -813,22 +812,8 @@ tfw_apm_calc(TfwApmData *data) atomic_inc(&data->stats.rdidx); write_unlock(&asent->rwlock); } -} - -/* - * Calculate the latest percentiles if necessary. - * Runs periodically on timer. - */ -static void -tfw_apm_pstats_fn(unsigned long fndata) -{ - TfwApmData *data = (TfwApmData *)fndata; - - tfw_apm_calc(data); - smp_mb__before_atomic(); - if (test_bit(TFW_APM_DATA_F_REARM, &data->flags)) - mod_timer(&data->timer, jiffies + TFW_APM_TIMER_TIMEOUT); + return nfilled % asent->pstats.psz; } /* @@ -892,6 +877,19 @@ tfw_apm_pstats_verify(TfwPrcntlStats *pstats) return 0; } +static inline void +tfw_apm_data_put(TfwApmData *data) +{ + if (atomic_dec_and_test(&data->refcnt)) + kfree(data); +} + +static inline void +tfw_apm_data_get(TfwApmData *data) +{ + atomic_inc(&data->refcnt); +} + static inline void __tfw_apm_update(TfwApmRBuf *rbuf, unsigned long jtstamp, unsigned int rtt) { @@ -900,39 +898,107 @@ __tfw_apm_update(TfwApmRBuf *rbuf, unsigned long jtstamp, unsigned int rtt) TfwApmRBEnt *crbent = &rbuf->rbent[centry]; tfw_apm_rbent_checkreset(crbent, jtmistart); - tfw_stats_update(&crbent->pcntrng, rtt, &rbuf->slock); + tfw_stats_update(&crbent->pcntrng, rtt); } -void -tfw_apm_update(void *apmdata, unsigned long jtstamp, unsigned long jrtt) +/* + * Calculate the latest percentiles if necessary. + * Runs periodically on timer. + */ +static void +tfw_apm_prcntl_tmfn(unsigned long fndata) { - unsigned int rtt = jiffies_to_msecs(jrtt); + int cpu, interval = TFW_APM_TIMER_TIMEOUT; + TfwApmData *data, *tmp; + + /* No arguments. */ + BUG_ON(fndata); - BUG_ON(!apmdata); /* - * APM stats can't handle response times that are greater than - * the maximum value possible for TfwPcntCtl{}->end. Currently - * the value is USHRT_MAX which is about 65 secs in milliseconds. + * Process work queues on all CPUs and update stats with data + * from each work item in the queue. Add servers with updated + * stats to the list for calculation of stats. Each server is + * is added to the list just once. + * + * If server's APM data is already on the list, that means it + * is on @qrecalc list. Just remove it from @qrecalc list and + * it will be put on @qcalc list as usual for calculation of + * stats values. Note that this is a highly unlikely case. + * + * Note that if server needs a recalculation of stats values, + * it makes sense only if there were updates to server's stats + * data. If there's no updates then a recalculation will lead + * to the same (insufficient) result. 
+ */ + for_each_online_cpu(cpu) { + TfwApmWqItem wq_item; + TfwRBQueue *wq = &per_cpu(tfw_apm_wq, cpu); + + while (!tfw_wq_pop(wq, &wq_item)) { + data = wq_item.data; + __tfw_apm_update(&data->rbuf, + wq_item.jtstamp, wq_item.rtt); + if (data->flags & TFW_APM_DATA_F_UPDONE) { + tfw_apm_data_put(data); + continue; + } + if (unlikely(!list_empty(&data->list))) + list_del_init(&data->list); + data->flags |= TFW_APM_DATA_F_UPDONE; + list_add_tail(&data->list, &tfw_apm_qcalc); + } + } + /* + * Calculate stats values for each server that has been updated. + * If the calculation cannot be completed with the current data, + * then move that server to a separate list. When stats data is + * updated, the calculation will be repeated. + */ + list_for_each_entry_safe(data, tmp, &tfw_apm_qcalc, list) { + BUG_ON(!(data->flags & TFW_APM_DATA_F_UPDONE)); + list_del_init(&data->list); + data->flags &= ~TFW_APM_DATA_F_UPDONE; + if (unlikely(tfw_apm_calc(data))) { + list_add_tail(&data->list, &tfw_apm_qrecalc); + continue; + } + tfw_apm_data_put(data); + } + + /* + * Recalculation of stats values is needed for some servers. + * Do it ASAP in anticipation that will be updates to stats + * data for those servers. */ - if (likely(rtt < (1UL << FIELD_SIZEOF(TfwPcntCtl, end) * 8))) - __tfw_apm_update(&((TfwApmData *)apmdata)->rbuf, jtstamp, rtt); + if (unlikely(!list_empty(&tfw_apm_qrecalc))) + interval = 1; + + smp_mb(); + if (test_bit(TFW_APM_DATA_F_REARM, &tfw_apm_rearm)) + mod_timer(&tfw_apm_timer, jiffies + interval); } -/* - * Destroy the specified APM ring buffer. - */ void -tfw_apm_destroy(void *apmdata) +tfw_apm_update(void *apmref, unsigned long jtstamp, unsigned long jrtt) { - TfwApmData *data = apmdata; - - if (!data) - return; - clear_bit(TFW_APM_DATA_F_REARM, &data->flags); - smp_mb__after_atomic(); - del_timer_sync(&data->timer); + unsigned int rtt = jiffies_to_msecs(jrtt); - kfree(data); + BUG_ON(!apmref); + /* + * APM stats can't handle response times that are greater than + * the maximum value possible for TfwPcntCtl{}->end. Currently + * the value is USHRT_MAX which is about 65 secs in milliseconds. + */ + if (likely(rtt < (1UL << FIELD_SIZEOF(TfwPcntCtl, end) * 8))) { + TfwApmWqItem wq_item = { + .data = apmref, + .jtstamp = jtstamp, + .rtt = rtt, + }; + tfw_apm_data_get(wq_item.data); + if (__tfw_wq_push(this_cpu_ptr(&tfw_apm_wq), &wq_item, 0)) + tfw_apm_data_put(wq_item.data); + } } /* @@ -994,14 +1060,36 @@ tfw_apm_create(void) rwlock_init(&data->stats.asent[1].rwlock); atomic_set(&data->stats.rdidx, 0); - /* Start the timer for the percentile calculation. */ - set_bit(TFW_APM_DATA_F_REARM, &data->flags); - setup_timer(&data->timer, tfw_apm_pstats_fn, (unsigned long)data); - mod_timer(&data->timer, jiffies + TFW_APM_TIMER_TIMEOUT); + INIT_LIST_HEAD(&data->list); return data; } +int +tfw_apm_add_srv(TfwServer *srv) +{ + TfwApmData *data; + + BUG_ON(srv->apmref); + + if (!(data = tfw_apm_create())) + return -ENOMEM; + + tfw_apm_data_get(data); + srv->apmref = data; + + return 0; +} + +void +tfw_apm_del_srv(TfwServer *srv) +{ + TfwApmData *data = srv->apmref; + + srv->apmref = NULL; + tfw_apm_data_put(data); +} + #define TFW_APM_MIN_TMWSCALE 1 /* Minimum time window scale. */ #define TFW_APM_MAX_TMWSCALE 50 /* Maximum time window scale. */ #define TFW_APM_DEF_TMWSCALE 5 /* Default time window scale. 
*/ @@ -1015,6 +1103,7 @@ tfw_apm_create(void) static int tfw_apm_cfg_start(void) { + int cpu; unsigned int jtmwindow; if (!tfw_apm_jtmwindow) @@ -1052,9 +1141,50 @@ tfw_apm_cfg_start(void) } tfw_apm_jtmwindow = tfw_apm_jtmintrvl * tfw_apm_tmwscale; + TFW_WQ_CHECKSZ(TfwApmWqItem); + for_each_online_cpu(cpu) { + TfwRBQueue *wq = &per_cpu(tfw_apm_wq, cpu); + tfw_wq_init(wq, cpu_to_node(cpu)); + } + + tfw_apm_rearm = 0; + INIT_LIST_HEAD(&tfw_apm_qcalc); + INIT_LIST_HEAD(&tfw_apm_qrecalc); + + /* Start the timer for the percentile calculation. */ + set_bit(TFW_APM_DATA_F_REARM, &tfw_apm_rearm); + setup_timer(&tfw_apm_timer, tfw_apm_prcntl_tmfn, 0UL); + mod_timer(&tfw_apm_timer, jiffies + TFW_APM_TIMER_TIMEOUT); + return 0; } +static void +tfw_apm_cfg_stop(void) +{ + int cpu; + TfwApmData *data, *tmp; + + clear_bit(TFW_APM_DATA_F_REARM, &tfw_apm_rearm); + smp_mb__after_atomic(); + del_timer_sync(&tfw_apm_timer); + + for_each_online_cpu(cpu) { + TfwApmWqItem wq_item; + TfwRBQueue *wq = &per_cpu(tfw_apm_wq, cpu); + + while (!tfw_wq_pop(wq, &wq_item)) + tfw_apm_data_put(wq_item.data); + + tfw_wq_destroy(wq); + } + list_for_each_entry_safe(data, tmp, &tfw_apm_qrecalc, list) { + list_del_init(&data->list); + tfw_apm_data_put(data); + } + BUG_ON(!list_empty(&tfw_apm_qcalc)); +} + /** * Cleanup the configuration values when when all server groups are stopped * and the APM timers are deleted. @@ -1112,5 +1242,6 @@ static TfwCfgSpec tfw_apm_cfg_specs[] = { TfwCfgMod tfw_apm_cfg_mod = { .name = "apm", .start = tfw_apm_cfg_start, + .stop = tfw_apm_cfg_stop, .specs = tfw_apm_cfg_specs, }; diff --git a/tempesta_fw/apm.h b/tempesta_fw/apm.h index 686eb32d1..3f529def2 100644 --- a/tempesta_fw/apm.h +++ b/tempesta_fw/apm.h @@ -21,6 +21,7 @@ #define __TFW_APM_H__ #include "pool.h" +#include "server.h" /* * @ith - array of percentile numbers, with space for min/max/avg; @@ -57,11 +58,11 @@ static const unsigned int __read_mostly tfw_pstats_ith[] = { [TFW_PSTATS_IDX_P99] = 99, }; -void *tfw_apm_create(void); -void tfw_apm_destroy(void *data); -void tfw_apm_update(void *data, unsigned long jtstamp, unsigned long jrtime); -int tfw_apm_stats(void *data, TfwPrcntlStats *pstats); -int tfw_apm_stats_bh(void *data, TfwPrcntlStats *pstats); +int tfw_apm_add_srv(TfwServer *srv); +void tfw_apm_del_srv(TfwServer *srv); +void tfw_apm_update(void *apmref, unsigned long jtstamp, unsigned long jrtime); +int tfw_apm_stats(void *apmref, TfwPrcntlStats *pstats); +int tfw_apm_stats_bh(void *apmref, TfwPrcntlStats *pstats); int tfw_apm_pstats_verify(TfwPrcntlStats *pstats); #endif /* __TFW_APM_H__ */ diff --git a/tempesta_fw/http.c b/tempesta_fw/http.c index 229c3ff85..6fb1955fe 100644 --- a/tempesta_fw/http.c +++ b/tempesta_fw/http.c @@ -2178,7 +2178,7 @@ tfw_http_resp_cache_cb(TfwHttpReq *req, TfwHttpResp *resp) * value of RTT has an upper boundary in the APM. 
*/ if (resp->conn) - tfw_apm_update(((TfwServer *)resp->conn->peer)->apm, + tfw_apm_update(((TfwServer *)resp->conn->peer)->apmref, resp->jrxtstamp, resp->jrxtstamp - req->jtxtstamp); tfw_http_resp_fwd(req, resp); diff --git a/tempesta_fw/procfs.c b/tempesta_fw/procfs.c index 8b943c436..561bda9b1 100644 --- a/tempesta_fw/procfs.c +++ b/tempesta_fw/procfs.c @@ -156,7 +156,7 @@ tfw_srvstats_seq_show(struct seq_file *seq, void *off) .psz = ARRAY_SIZE(tfw_pstats_ith) }; - tfw_apm_stats_bh(srv->apm, &pstats); + tfw_apm_stats_bh(srv->apmref, &pstats); SPRNE("Minimal response time\t\t", pstats.val[TFW_PSTATS_IDX_MIN]); SPRNE("Average response time\t\t", pstats.val[TFW_PSTATS_IDX_AVG]); diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 559d897e8..a13f251e3 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -448,7 +448,7 @@ tfw_sched_ratio_fill_apmdata(TfwRatioPool *rpool, TfwRatio *ratio) */ for (si = 0; si < rpool->srv_n; ++si) { pstats.seq = srvdesc[si].seq; - recalc |= tfw_apm_stats(srvdesc[si].srv->apm, &pstats); + recalc |= tfw_apm_stats(srvdesc[si].srv->apmref, &pstats); srvdesc[si].seq = pstats.seq; srvdata[si].sdidx = si; diff --git a/tempesta_fw/server.c b/tempesta_fw/server.c index 3fdcd193f..4d4e8a119 100644 --- a/tempesta_fw/server.c +++ b/tempesta_fw/server.c @@ -47,7 +47,7 @@ tfw_server_destroy(TfwServer *srv) /* Close all connections before freeing the server! */ BUG_ON(!list_empty(&srv->conn_list)); - tfw_apm_destroy(srv->apm); + tfw_apm_del_srv(srv); kmem_cache_free(srv_cache, srv); } @@ -64,16 +64,6 @@ tfw_server_create(const TfwAddr *addr) return srv; } -int -tfw_server_apm_create(TfwServer *srv) -{ - BUG_ON(!srv); - - if (!(srv->apm = tfw_apm_create())) - return -ENOMEM; - return 0; -} - /* * Look up Server Group by name, and return it to caller. * diff --git a/tempesta_fw/server.h b/tempesta_fw/server.h index 325560799..7bd283de7 100644 --- a/tempesta_fw/server.h +++ b/tempesta_fw/server.h @@ -43,7 +43,7 @@ typedef struct tfw_scheduler_t TfwScheduler; * @list - member pointer in the list of servers of a server group; * @sg - back-reference to the server group; * @sched_data - private scheduler data for the server; - * @apm - opaque handle for APM stats; + * @apmref - opaque handle for APM stats; * @weight - static server weight for load balancers; * @conn_n - configured number of connections to the server; */ @@ -52,7 +52,7 @@ typedef struct { struct list_head list; TfwSrvGroup *sg; void *sched_data; - void *apm; + void *apmref; unsigned int weight; size_t conn_n; } TfwServer; @@ -153,7 +153,6 @@ struct tfw_scheduler_t { /* Server specific routines. */ TfwServer *tfw_server_create(const TfwAddr *addr); -int tfw_server_apm_create(TfwServer *srv); void tfw_server_destroy(TfwServer *srv); void tfw_srv_conn_release(TfwSrvConn *srv_conn); diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c index aea60c32d..2813e80c8 100644 --- a/tempesta_fw/sock_srv.c +++ b/tempesta_fw/sock_srv.c @@ -1298,7 +1298,7 @@ tfw_sock_srv_start(void) * has been processed as it depends on configuration directives * that can be located anywhere in the configuration file. 
 	 */
-	if ((ret = tfw_sg_for_each_srv(tfw_server_apm_create)) != 0)
+	if ((ret = tfw_sg_for_each_srv(tfw_apm_add_srv)) != 0)
 		return ret;
 
 	return tfw_sg_for_each_srv(tfw_sock_srv_connect_srv);

From a163fcc1ea1080df68200eaa4c9dc5c37ead24fd Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Tue, 2 May 2017 15:49:03 +0300
Subject: [PATCH 26/37] Address code review issues raised by @ikoveshnikov.

---
 etc/tempesta_fw.conf                  | 56 ++++++++++++++++++++++-----
 tempesta_fw/apm.c                     | 10 ++---
 tempesta_fw/server.h                  |  2 +-
 tempesta_fw/t/unit/sched_helper.c     | 24 ++++--------
 tempesta_fw/t/unit/sched_helper.h     |  2 +-
 tempesta_fw/t/unit/test_sched_hash.c  | 12 ++----
 tempesta_fw/t/unit/test_sched_http.c  | 39 +++++++-----------
 tempesta_fw/t/unit/test_sched_ratio.c | 18 ++-------
 8 files changed, 83 insertions(+), 80 deletions(-)

diff --git a/etc/tempesta_fw.conf b/etc/tempesta_fw.conf
index f469d6a5c..aa9f80a39 100644
--- a/etc/tempesta_fw.conf
+++ b/etc/tempesta_fw.conf
@@ -8,17 +8,49 @@
 # a group.
 #
 # Syntax:
-# sched SCHED_NAME;
+# sched SCHED_NAME [OPTIONS];
 #
 # SCHED_NAME is a name of a scheduler module that distributes the load
 # among servers within a group. There are two schedulers available:
-# - "round-robin" (default) - rotates all servers in the group in
-#   the round-robin manner, so requests are distributed uniformly across
-#   servers.
+# - "ratio" (default) - Balances the load across servers in a group based
+#   on each server's weight. Requests are forwarded more to servers with
+#   more weight, and less to servers with less weight. As a result, each
+#   server in a group receives an optimal load. In the default
+#   configuration, where weights are not specified, server weights are
+#   considered equal, and the scheduler works in a pure round-robin
+#   fashion.
 # - "hash" - chooses a server based on a URI/Host hash of a request.
 #   Requests are still distributed uniformly, but a request with the same
 #   URI/Host is always sent to the same server.
 #
+# OPTIONS are optional. Not all schedulers have additional options.
+#
+# "ratio" scheduler may have the following options:
+# - static - The weight of each server in a group is defined statically
+#   with the [weight=<N>] option of the `server` directive. This is the
+#   default Ratio scheduler option.
+# - dynamic - The weight of each server in a group is defined dynamically.
+#   The specific type of dynamic weight is specified with additional options:
+#   - minimum - The current minimum response time from a server;
+#   - maximum - The current maximum response time from a server;
+#   - average - The current average response time from a server;
+#   - percentile [<N>] - The current response time from a server
+#     that is within the specified percentile. The percentile may be
+#     one of 50, 75, 90, 95, 99. If none is given, then the default
+#     percentile of 90 is used.
+#   If a specific type of dynamic weight is not specified, then
+#   the default type of "average" is used.
+# - predict - The weight of each server in a group is predicted dynamically
+#   for a time in the future, based on the server's behavior in the past.
+#   Additional options include those that are defined for "dynamic" weight,
+#   as well as the following options:
+#   - past - Period of time (in seconds) to keep past response time
+#     values from a server. The default value is 30 seconds.
+#   - rate - Rate (times per second) of retrieval of past response time
+#     values. The default value is 20 times per second.
+#   - ahead - Period of time (in seconds) for which to make a prediction;
+#     It can't be more than half of **past**. The default value is 15
+#     seconds.
+#
 # Note that there's also the HTTP scheduler. It dispatches requests among
 # server groups only. Round-robin or hash scheduler must be used to select
 # a server within a group.
@@ -30,7 +62,7 @@
 # the `sched` directive.
 #
 # Default:
-# sched round-robin;
+# sched ratio;
 #
 
 # TAG: server.
@@ -38,17 +70,23 @@
 # Specifies an IP address/port of a back-end HTTP server.
 #
 # Syntax:
-# server IPADDR[:PORT] [conns_n=N]
+# server IPADDR[:PORT] [conns_n=N] [weight=N];
 #
 # IPADDR may be either IPv4 or IPv6 address, hostnames are not allowed.
 # IPv6 address must be enclosed in square brackets (e.g. "[::0]" but not "::0").
 # PORT defaults to 80 if not set.
 #
 # conns_n=N is the number of parallel connections to the server.
-# The N defaults to 32 if not set.
+# The N defaults to 32 if the option is not specified.
+#
+# weight=N is the static weight of the server. The weight must be in
+# the range of 1 to 100. If not specified, then the default weight of 50
+# is used with the static ratio scheduler. For convenience, a weight may
+# be specified only when it differs from the default value.
+#
 #
 # Multiple back-end servers may be specified, for example:
-# server 10.1.0.1:80
+# server 10.1.0.1:80;
 # server [fc00::1]:80;
 #
 # Default:
@@ -60,7 +98,7 @@
 # Defines a request that is considered non-idempotent.
 #
 # Syntax:
-# nonidempotent <METHOD> <OP> <string>
+# nonidempotent <METHOD> <OP> <string>;
 #
 # <METHOD> is one of supported HTTP methods, such as GET, HEAD, POST, etc.
 # <OP> is a string matching operator, one of "eq", "prefix", "suffix", or "*".

diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c
index 373fca9cc..fdd40b0f0 100644
--- a/tempesta_fw/apm.c
+++ b/tempesta_fw/apm.c
@@ -500,12 +500,12 @@ typedef struct {
 static DEFINE_PER_CPU(TfwRBQueue, tfw_apm_wq);
 
 /*
- * @tfw_apm_qcalc - List of servers that require stats calculation.
- * @tfw_apm_qrecalc - List of servers that require stats re-calculation.
- * @tfw_apm_rearm - Atomic flag, tells if the timer needs re-arming.
- * @tfw_apm_timer - The periodic timer handle.
+ * @tfw_apm_qcalc	- List of servers that require stats calculation.
+ * @tfw_apm_qrecalc	- List of servers that require stats re-calculation.
+ * @tfw_apm_rearm	- Atomic flag, tells if the timer needs re-arming.
+ * @tfw_apm_timer	- The periodic timer handle.
  */
-#define TFW_APM_DATA_F_REARM (0x0001) /* Re-arm the timer. */
+#define TFW_APM_DATA_F_REARM	(0x0001)	/* Re-arm the timer. 
*/ static struct list_head tfw_apm_qcalc; static struct list_head tfw_apm_qrecalc; diff --git a/tempesta_fw/server.h b/tempesta_fw/server.h index 7bd283de7..c9a4b22b9 100644 --- a/tempesta_fw/server.h +++ b/tempesta_fw/server.h @@ -83,7 +83,7 @@ struct tfw_srv_group_t { rwlock_t lock; TfwScheduler *sched; void *sched_data; - int srv_n; + size_t srv_n; unsigned int max_qsize; unsigned int max_refwd; unsigned long max_jqage; diff --git a/tempesta_fw/t/unit/sched_helper.c b/tempesta_fw/t/unit/sched_helper.c index 1d1c7e079..152433776 100644 --- a/tempesta_fw/t/unit/sched_helper.c +++ b/tempesta_fw/t/unit/sched_helper.c @@ -66,13 +66,14 @@ test_create_sg(const char *name) } void -test_start_sg(TfwSrvGroup *sg, const char *sched_name) +test_start_sg(TfwSrvGroup *sg, const char *sched_name, unsigned int flags) { kernel_fpu_end(); { int r = tfw_sg_set_sched(sg, sched_name); BUG_ON(r); + sg->flags = flags; } kernel_fpu_begin(); @@ -167,8 +168,7 @@ test_sched_sg_empty_sg(struct TestSchedHelper *sched_helper) BUG_ON(!sched_helper->free_sched_arg); sg = test_create_sg("test"); - sg->flags = sched_helper->flags; - test_start_sg(sg, sched_helper->sched); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -199,8 +199,7 @@ test_sched_sg_one_srv_zero_conn(struct TestSchedHelper *sched_helper) sg = test_create_sg("test"); test_create_srv("127.0.0.1", sg); - sg->flags = sched_helper->flags; - test_start_sg(sg, sched_helper->sched); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -234,9 +233,7 @@ test_sched_sg_max_srv_zero_conn(struct TestSchedHelper *sched_helper) for (j = 0; j < TFW_TEST_SG_MAX_SRV_N; ++j) test_create_srv("127.0.0.1", sg); - - sg->flags = sched_helper->flags; - test_start_sg(sg, sched_helper->sched); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -279,8 +276,7 @@ test_sched_srv_one_srv_zero_conn(struct TestSchedHelper *sched_helper) sg = test_create_sg("test"); srv = test_create_srv("127.0.0.1", sg); - sg->flags = sched_helper->flags; - test_start_sg(sg, sched_helper->sched); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -313,9 +309,7 @@ test_sched_srv_max_srv_zero_conn(struct TestSchedHelper *sched_helper) for (j = 0; j < TFW_TEST_SG_MAX_SRV_N; ++j) test_create_srv("127.0.0.1", sg); - - sg->flags = sched_helper->flags; - test_start_sg(sg, sched_helper->sched); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -377,9 +371,7 @@ test_sched_srv_offline_srv(struct TestSchedHelper *sched_helper) break; } } - - sg->flags = sched_helper->flags; - test_start_sg(sg, sched_helper->sched); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); diff --git a/tempesta_fw/t/unit/sched_helper.h b/tempesta_fw/t/unit/sched_helper.h index f39d8ae82..cdc179513 100644 --- a/tempesta_fw/t/unit/sched_helper.h +++ b/tempesta_fw/t/unit/sched_helper.h @@ -37,7 +37,7 @@ void sched_helper_init(void); void test_spec_cleanup(TfwCfgSpec specs[]); TfwSrvGroup 
*test_create_sg(const char *name); -void test_start_sg(TfwSrvGroup *sg, const char *sched_name); +void test_start_sg(TfwSrvGroup *sg, const char *sched_name, unsigned int flags); void test_sg_release_all(void); TfwServer *test_create_srv(const char *in_addr, TfwSrvGroup *sg); diff --git a/tempesta_fw/t/unit/test_sched_hash.c b/tempesta_fw/t/unit/test_sched_hash.c index ada7528c2..f4e94d2a3 100644 --- a/tempesta_fw/t/unit/test_sched_hash.c +++ b/tempesta_fw/t/unit/test_sched_hash.c @@ -102,8 +102,7 @@ TEST(tfw_sched_hash, sched_sg_one_srv_max_conn) for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) test_create_srv_conn(srv); - - test_start_sg(sg, sched_helper_hash.sched); + test_start_sg(sg, sched_helper_hash.sched, 0); /* Check that every request is scheduled to the same connection. */ for (i = 0; i < sched_helper_hash.conn_types; ++i) { @@ -155,8 +154,7 @@ TEST(tfw_sched_hash, sched_sg_max_srv_max_conn) for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) test_create_srv_conn(srv); } - - test_start_sg(sg, sched_helper_hash.sched); + test_start_sg(sg, sched_helper_hash.sched, 0); /* Check that every request is scheduled to the same connection. */ for (i = 0; i < sched_helper_hash.conn_types; ++i) { @@ -205,8 +203,7 @@ TEST(tfw_sched_hash, sched_srv_one_srv_max_conn) for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) test_create_srv_conn(srv); - - test_start_sg(sg, sched_helper_hash.sched); + test_start_sg(sg, sched_helper_hash.sched, 0); /* Check that every request is scheduled to the same connection. */ for (i = 0; i < sched_helper_hash.conn_types; ++i) { @@ -260,8 +257,7 @@ TEST(tfw_sched_hash, sched_srv_max_srv_max_conn) for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) test_create_srv_conn(srv); } - - test_start_sg(sg, sched_helper_hash.sched); + test_start_sg(sg, sched_helper_hash.sched, 0); /* Check that every request is scheduled to the same connection. 
*/ for (i = 0; i < sched_helper_hash.conn_types; ++i) { diff --git a/tempesta_fw/t/unit/test_sched_http.c b/tempesta_fw/t/unit/test_sched_http.c index 6d0fa8538..4e87cff01 100644 --- a/tempesta_fw/t/unit/test_sched_http.c +++ b/tempesta_fw/t/unit/test_sched_http.c @@ -111,8 +111,7 @@ TEST(tfw_sched_http, zero_rules_and_zero_conns) TEST(tfw_sched_http, one_rule_and_zero_conns) { TfwSrvGroup *sg = test_create_sg("default"); - sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg, "ratio"); + test_start_sg(sg, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); if (parse_cfg("sched_http_rules {\nmatch default * * *;\n}\n")) { TEST_FAIL("can't parse rules\n"); @@ -133,8 +132,7 @@ TEST(tfw_sched_http, one_wildcard_rule) sg = test_create_sg("default"); srv = test_create_srv("127.0.0.1", sg); expect_conn = test_create_srv_conn(srv); - sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg, "ratio"); + test_start_sg(sg, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); if (parse_cfg("sched_http_rules {\nmatch default * * *;\n}\n")) { TEST_FAIL("can't parse rules\n"); @@ -159,62 +157,52 @@ TEST(tfw_sched_http, some_rules) sg1 = test_create_sg("sg1"); srv = test_create_srv("127.0.0.1", sg1); expect_conn1 = test_create_srv_conn(srv); - sg1->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg1, "ratio"); + test_start_sg(sg1, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); sg2 = test_create_sg("sg2"); srv = test_create_srv("127.0.0.1", sg2); expect_conn2 = test_create_srv_conn(srv); - sg2->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg2, "ratio"); + test_start_sg(sg2, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); sg3 = test_create_sg("sg3"); srv = test_create_srv("127.0.0.1", sg3); expect_conn3 = test_create_srv_conn(srv); - sg3->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg3, "ratio"); + test_start_sg(sg3, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); sg4 = test_create_sg("sg4"); srv = test_create_srv("127.0.0.1", sg4); expect_conn4 = test_create_srv_conn(srv); - sg4->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg4, "ratio"); + test_start_sg(sg4, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); sg5 = test_create_sg("sg5"); srv = test_create_srv("127.0.0.1", sg5); expect_conn5 = test_create_srv_conn(srv); - sg5->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg5, "ratio"); + test_start_sg(sg5, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); sg6 = test_create_sg("sg6"); srv = test_create_srv("127.0.0.1", sg6); expect_conn6 = test_create_srv_conn(srv); - sg6->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg6, "ratio"); + test_start_sg(sg6, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); sg7 = test_create_sg("sg7"); srv = test_create_srv("127.0.0.1", sg7); expect_conn7 = test_create_srv_conn(srv); - sg7->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg7, "ratio"); + test_start_sg(sg7, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); sg8 = test_create_sg("sg8"); srv = test_create_srv("127.0.0.1", sg8); expect_conn8 = test_create_srv_conn(srv); - sg8->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg8, "ratio"); + test_start_sg(sg8, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); sg9 = test_create_sg("sg9"); srv = test_create_srv("127.0.0.1", sg9); expect_conn9 = test_create_srv_conn(srv); - sg9->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg9, "ratio"); + test_start_sg(sg9, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); sg10 = test_create_sg("sg10"); srv = test_create_srv("127.0.0.1", sg10); expect_conn10 = test_create_srv_conn(srv); - sg10->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg10, "ratio"); + 
test_start_sg(sg10, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); if (parse_cfg("sched_http_rules {\nmatch sg1 uri eq /foo;\n\ match sg2 uri prefix /foo/bar;\n\ @@ -329,8 +317,7 @@ TEST(tfw_sched_http, one_rule) sg = test_create_sg("default"); srv = test_create_srv("127.0.0.1", sg); expect_conn = test_create_srv_conn(srv); - sg->flags = TFW_SG_F_SCHED_RATIO_STATIC; - test_start_sg(sg, "ratio"); + test_start_sg(sg, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); if (parse_cfg(test_cases[i].rule_str)) { TEST_FAIL("can't parse rules\n"); diff --git a/tempesta_fw/t/unit/test_sched_ratio.c b/tempesta_fw/t/unit/test_sched_ratio.c index 14dfd5d84..21f6ea7eb 100644 --- a/tempesta_fw/t/unit/test_sched_ratio.c +++ b/tempesta_fw/t/unit/test_sched_ratio.c @@ -86,9 +86,7 @@ TEST(tfw_sched_ratio, sched_sg_one_srv_max_conn) srv_conn = test_create_srv_conn(srv); conn_acc ^= (long long)srv_conn; } - - sg->flags = sched_helper_ratio.flags; - test_start_sg(sg, sched_helper_ratio.sched); + test_start_sg(sg, sched_helper_ratio.sched, sched_helper_ratio.flags); /* * Check that connections are scheduled in fair way: @@ -115,7 +113,6 @@ TEST(tfw_sched_ratio, sched_sg_one_srv_max_conn) kernel_fpu_begin(); } - sched_helper_ratio.free_sched_arg(msg); EXPECT_EQ(conn_acc, conn_acc_check); sched_helper_ratio.free_sched_arg(msg); } @@ -146,9 +143,7 @@ TEST(tfw_sched_ratio, sched_sg_max_srv_max_conn) conn_acc ^= (long long)srv_conn; } } - - sg->flags = sched_helper_ratio.flags; - test_start_sg(sg, sched_helper_ratio.sched); + test_start_sg(sg, sched_helper_ratio.sched, sched_helper_ratio.flags); /* * Check that connections are scheduled in fair way: @@ -168,7 +163,6 @@ TEST(tfw_sched_ratio, sched_sg_max_srv_max_conn) tfw_srv_conn_put(srv_conn); } - sched_helper_ratio.free_sched_arg(msg); EXPECT_EQ(conn_acc, conn_acc_check); sched_helper_ratio.free_sched_arg(msg); } @@ -195,9 +189,7 @@ TEST(tfw_sched_ratio, sched_srv_one_srv_max_conn) srv_conn = test_create_srv_conn(srv); conn_acc ^= (long long)srv_conn; } - - sg->flags = sched_helper_ratio.flags; - test_start_sg(sg, sched_helper_ratio.sched); + test_start_sg(sg, sched_helper_ratio.sched, sched_helper_ratio.flags); /* * Check that connections are scheduled in fair way: @@ -261,9 +253,7 @@ TEST(tfw_sched_ratio, sched_srv_max_srv_max_conn) srv_acc[i].conn_acc ^= (long long)srv_conn; } } - - sg->flags = sched_helper_ratio.flags; - test_start_sg(sg, sched_helper_ratio.sched); + test_start_sg(sg, sched_helper_ratio.sched, sched_helper_ratio.flags); /* * Check that connections are scheduled in fair way: From 8ee11deab5b575feb7dbe7bf4c592c1d458fecf6 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Wed, 3 May 2017 18:37:10 +0300 Subject: [PATCH 27/37] Correctly release server's APM data on cleanup. --- tempesta_fw/apm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index fdd40b0f0..40b21c274 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -1084,10 +1084,11 @@ tfw_apm_add_srv(TfwServer *srv) void tfw_apm_del_srv(TfwServer *srv) { - TfwApmData *data = srv->apmref; + if (!srv->apmref) + return; + tfw_apm_data_put(srv->apmref); srv->apmref = NULL; - tfw_apm_data_put(data); } #define TFW_APM_MIN_TMWSCALE 1 /* Minimum time window scale. */ From e8473a0b38aedf53dc239f5fbeb1931c619e56e3 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Fri, 5 May 2017 15:32:48 +0300 Subject: [PATCH 28/37] Fix schedulers unit tests - set sg->flags before sched registration. 
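
The scheduler consults sg->flags while a group is being registered:
judging by tfw_sched_ratio_del_grp() above, the periodic re-calculation
timer exists only when TFW_SG_F_SCHED_RATIO_DYNAMIC or
TFW_SG_F_SCHED_RATIO_PREDICT is set, so setting the flags after
tfw_sg_set_sched() leaves the scheduler's callbacks looking at empty
flags. A sketch of the corrected ordering in the test helper (error
handling elided):

	sg->flags = flags;	/* e.g. TFW_SG_F_SCHED_RATIO_DYNAMIC */
	r = tfw_sg_set_sched(sg, sched_name);	/* callbacks see the flags */
	BUG_ON(r);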
--- tempesta_fw/t/unit/sched_helper.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tempesta_fw/t/unit/sched_helper.c b/tempesta_fw/t/unit/sched_helper.c index 152433776..f22bff929 100644 --- a/tempesta_fw/t/unit/sched_helper.c +++ b/tempesta_fw/t/unit/sched_helper.c @@ -68,13 +68,13 @@ test_create_sg(const char *name) void test_start_sg(TfwSrvGroup *sg, const char *sched_name, unsigned int flags) { + int r; + kernel_fpu_end(); - { - int r = tfw_sg_set_sched(sg, sched_name); - BUG_ON(r); - sg->flags = flags; - } + sg->flags = flags; + r = tfw_sg_set_sched(sg, sched_name); + BUG_ON(r); kernel_fpu_begin(); } From bf850b88d5d5849f501949474e2cb28c2890740b Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Mon, 15 May 2017 16:19:44 +0300 Subject: [PATCH 29/37] Multiple minor changes to fix issues raised in code reviews. --- tempesta_fw/apm.c | 2 +- tempesta_fw/sched/tfw_sched_hash.c | 81 ++- tempesta_fw/sched/tfw_sched_ratio.c | 558 +++++++++++--------- tempesta_fw/server.c | 5 + tempesta_fw/sock_srv.c | 6 +- tempesta_fw/t/unit/user_space/percentiles.c | 17 +- 6 files changed, 346 insertions(+), 323 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index 40b21c274..78d2b3956 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -918,7 +918,7 @@ tfw_apm_prcntl_tmfn(unsigned long fndata) * Process work queues on all CPUs and update stats with data * from each work item in the queue. Add servers with updated * stats to the list for calculation of stats. Each server is - * is added to the list just once. + * added to the list just once. * * If server's APM data is already on the list, that means it * is on @qrecalc list. Just remove it from @qrecalc list and diff --git a/tempesta_fw/sched/tfw_sched_hash.c b/tempesta_fw/sched/tfw_sched_hash.c index 083c75f16..19b592cbf 100644 --- a/tempesta_fw/sched/tfw_sched_hash.c +++ b/tempesta_fw/sched/tfw_sched_hash.c @@ -201,7 +201,7 @@ tfw_sched_hash_get_srv_conn(TfwMsg *msg, TfwServer *srv) } static void -tfw_sched_hash_cleanup(TfwSrvGroup *sg) +tfw_sched_hash_del_grp(TfwSrvGroup *sg) { size_t si; TfwHashSrvList *sl = sg->sched_data; @@ -209,62 +209,24 @@ tfw_sched_hash_cleanup(TfwSrvGroup *sg) if (!sl) return; - for (si = 0; si < sl->srv_n; ++si) { + for (si = 0; si < sl->srv_n; ++si) if (sl->srvs[si].conn) kfree(sl->srvs[si].conn); - if (sl->srvs[si].hash) - kfree(sl->srvs[si].hash); - } - kfree(sl); sg->sched_data = NULL; } -static void -tfw_sched_hash_del_grp(TfwSrvGroup *sg) -{ - tfw_sched_hash_cleanup(sg); -} - -/** - * Validate the integrity of a group. - * - * Make sure that number of servers in the group, and the number - * of connections for each server match the recorded values. 
- */ -static int -tfw_sched_hash_validate_grp(TfwSrvGroup *sg) -{ - size_t si = 0, ci; - TfwServer *srv; - TfwSrvConn *srv_conn; - - list_for_each_entry(srv, &sg->srv_list, list) { - ci = 0; - list_for_each_entry(srv_conn, &srv->conn_list, list) - ++ci; - if (ci > srv->conn_n) - return -EINVAL; - ++si; - } - if (si > sg->srv_n) - return -EINVAL; - - return 0; -} - static int tfw_sched_hash_add_grp(TfwSrvGroup *sg) { - int ret = -ENOMEM; - size_t size, ci; + int ret = -EINVAL; + size_t size, si, ci; unsigned int sum_conn_n; TfwServer *srv; - TfwSrvConn *srv_conn; TfwHashSrv *hsrv; TfwHashSrvList *sl; - if (tfw_sched_hash_validate_grp(sg)) + if (unlikely(!sg->srv_n || list_empty(&sg->srv_list))) return -EINVAL; size = sizeof(TfwHashSrvList) + sizeof(TfwHashSrv) * sg->srv_n; @@ -274,32 +236,49 @@ tfw_sched_hash_add_grp(TfwSrvGroup *sg) sl->srvs = sg->sched_data + sizeof(TfwHashSrvList); sl->srv_n = sg->srv_n; - sum_conn_n = 0; + si = sum_conn_n = 0; hsrv = sl->srvs; list_for_each_entry(srv, &sg->srv_list, list) { - size = sizeof(hsrv->conn[0]) * srv->conn_n; - if (!(hsrv->conn = kzalloc(size, GFP_KERNEL))) + TfwSrvConn **conn, *srv_conn; + unsigned long *hash; + + if (unlikely((si++ == sg->srv_n) || !srv->conn_n + || list_empty(&srv->conn_list))) goto cleanup; - size = sizeof(hsrv->hash[0]) * srv->conn_n; - if (!(hsrv->hash = kzalloc(size, GFP_KERNEL))) + + size = (sizeof(hsrv->conn[0]) + sizeof(hsrv->hash[0])) + * srv->conn_n; + if (!(hsrv->conn = kzalloc(size, GFP_KERNEL))) { + ret = -ENOMEM; goto cleanup; + } + hsrv->hash = (typeof(hsrv->hash))(hsrv->conn + srv->conn_n); + ci = 0; + conn = hsrv->conn; + hash = hsrv->hash; list_for_each_entry(srv_conn, &srv->conn_list, list) { + if (unlikely(ci++ == srv->conn_n)) + goto cleanup; ++sum_conn_n; - hsrv->conn[ci] = srv_conn; - hsrv->hash[ci++] = __calc_conn_hash(srv, sum_conn_n); + *conn++ = srv_conn; + *hash++ = __calc_conn_hash(srv, sum_conn_n); } + if (unlikely(ci != srv->conn_n)) + goto cleanup; hsrv->conn_n = srv->conn_n; hsrv->srv = srv; srv->sched_data = hsrv; ++hsrv; } + if (unlikely(si != sg->srv_n)) + goto cleanup; sl->conn_n = sum_conn_n; return 0; cleanup: - tfw_sched_hash_cleanup(sg); + tfw_sched_hash_del_grp(sg); return ret; } diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index a13f251e3..73b0aaeee 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -40,14 +40,14 @@ MODULE_LICENSE("GPL"); * Only fully established connections are considered by scheduler. * * @srv - pointer to server structure. - * @conns - list of pointers to server connection structures. + * @conn - list of pointers to server connection structures. * @counter - monotonic counter for choosing the next connection. * @conn_n - number of connections to server. * @seq - current sequence number for APM stats. */ typedef struct { TfwServer *srv; - TfwSrvConn **conns; + TfwSrvConn **conn; atomic64_t counter; size_t conn_n; unsigned int seq; @@ -94,50 +94,51 @@ typedef struct { /** * Historic (past) data unit for an individual upstream server. * - * @x - count of timer function invocations. - * @y - RTT from APM in msecs. + * @cnt - count of timer function invocations. + * @rtt - RTT from APM in msecs. */ typedef struct { - unsigned long x; - unsigned long y; -} TfwRatioHstXY; + unsigned long cnt; + unsigned long rtt; +} TfwRatioHstUnit; /** * Historic (past) data set for an individual upstream server. * This is the data set for simple linear regression calculation. 
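+ * The least-squares estimators that the fields below maintain are
+ * the standard ones for the model rtt = coeff_a + coeff_b * cnt
+ * (up to the fixed-point scaling by MUL in the code):
+ *
+ *   coeff_b = (avg(cnt * rtt) - avg(cnt) * avg(rtt))
+ *             / (avg(cnt * cnt) - avg(cnt)^2)
+ *   coeff_a = avg(rtt) - coeff_b * avg(cnt)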
* - * @a - coefficient for y = a + b * x + eps. - * @b - coefficient for y = a + b * x + eps. - * @x_avg - average x value. - * @y_avg - average y value. - * @xy_avg - avg(x * y). - * @x_avg_y_avg - avg(x) * avg(y). - * @x_sq_avg - avg(x * x). - * @x_avg_sq - avg(x) * avg(x). + * @coeff_a - coefficient for rtt = coeff_a + coeff_b * cnt + eps. + * @coeff_b - coefficient for rtt = coeff_a + coeff_b * cnt + eps. + * @cnt_avg - average cnt value. + * @rtt_avg - average rtt value. + * @cnt_rtt_avg - avg(cnt * rtt). + * @cnt_avg_rtt_avg - avg(cnt) * avg(rtt). + * @cnt_sq_avg - avg(cnt * cnt). + * @cnt_avg_sq - avg(cnt) * avg(cnt). + * @hist - array of history data units. */ typedef struct { - long a; - long b; - long x_avg; - long y_avg; - long xy_avg; - long x_avg_y_avg; - long x_sq_avg; - long x_avg_sq; - TfwRatioHstXY *hist; + long coeff_a; + long coeff_b; + long cnt_avg; + long rtt_avg; + long cnt_rtt_avg; + long cnt_avg_rtt_avg; + long cnt_sq_avg; + long cnt_avg_sq; + TfwRatioHstUnit *hist; } TfwRatioHstDesc; /** * Historic (past) data for predictive scheduler. * * @ahead - predict for this number of @intvl ahead. - * @past_sz - total number of slots for past data. + * @slot_n - total number of slots for past data. * @counter - slot that is available for storing past data. * @past - past data for each server (@past[@srv_n]). */ typedef struct { unsigned int ahead; - size_t past_sz; + size_t slot_n; unsigned long counter; TfwRatioHstDesc *past; } TfwRatioHstData; @@ -208,11 +209,7 @@ tfw_sched_ratio_srvdata_cmp(const void *lhs, const void *rhs) unsigned int lhs_ratio = ((const TfwRatioSrvData *)lhs)->oratio; unsigned int rhs_ratio = ((const TfwRatioSrvData *)rhs)->oratio; - if (lhs_ratio > rhs_ratio) - return -1; - if (lhs_ratio < rhs_ratio) - return 1; - return 0; + return (rhs_ratio < lhs_ratio) ? -1 : (rhs_ratio > lhs_ratio); } /** @@ -222,50 +219,29 @@ tfw_sched_ratio_srvdata_cmp(const void *lhs, const void *rhs) * Return a non-zero value if additional actions are needed. */ static int -tfw_sched_ratio_calc(TfwRatioPool *rpool, TfwRatio *ratio, size_t *arg_mvidx) +tfw_sched_ratio_calc(TfwRatioPool *rpool, TfwRatio *ratio, + unsigned long sum_wgt, size_t max_val_idx, + size_t *arg_ovidx) { - size_t si, max_val_idx; + size_t si, one_val_idx; unsigned int diff, max_wgt, oratio; - unsigned long unit, sum_wgt = 0, sum_ratio = 0; + unsigned long unit, sum_ratio = 0; TfwRatioSrvData *srvdata = ratio->srvdata; TfwRatioSchData *schdata = &ratio->schdata; - BUG_ON(!ratio); - - /* - * Calculate the sum of server's weights in the group. Remember - * the index of server data entry with maximum weight. That same - * entry will also have the maximum ratio. See if all weights in - * the group are the same. - */ - diff = max_val_idx = 0; - for (si = 0; si < rpool->srv_n; ++si) { - if (srvdata[max_val_idx].weight < srvdata[si].weight) - max_val_idx = si; - sum_wgt += srvdata[si].weight; - diff |= (srvdata[si].weight != srvdata[0].weight); - } - /* Set up the common part of scheduler data. */ schdata->csidx = 0; schdata->riter = 1; schdata->reidx = rpool->srv_n; /* - * If all server weights are the same, then there's no need to do - * anything else. Set up all ratios to 1 and be done with it. - */ - if (!diff) { - for (si = 0; si < rpool->srv_n; ++si) - srvdata[si].cratio = srvdata[si].oratio = 1; - schdata->crsum = schdata->orsum = rpool->srv_n; - return 0; - } - - /* - * Calculate each server's ratio using a special formula. See - * if all calculated ratios are the same. Set up scheduler data. 
+ * Calculate each server's ratio using the following formula: + * unit = (MAX_WEIGHT + SRV_NUM) * MAX_WEIGHT / sum(weight); + * ratio[i] = unit * weight[i] / MAX_WEIGHT; + * + * See if all calculated ratios are the same. Set scheduler data. */ + diff = one_val_idx = 0; max_wgt = srvdata[max_val_idx].weight; unit = ((max_wgt + rpool->srv_n) * max_wgt) / sum_wgt; for (si = 0; si < rpool->srv_n; ++si) { @@ -273,11 +249,13 @@ tfw_sched_ratio_calc(TfwRatioPool *rpool, TfwRatio *ratio, size_t *arg_mvidx) srvdata[si].cratio = srvdata[si].oratio = oratio; diff |= (oratio != srvdata[0].oratio); sum_ratio += oratio; + if ((oratio == 1) && !one_val_idx) + one_val_idx = si; } schdata->crsum = schdata->orsum = sum_ratio; - /* Return the index of server data entry with maximum ratio. */ - *arg_mvidx = max_val_idx; + /* Return the index of server data entry with value of 1. */ + *arg_ovidx = one_val_idx; return diff; } @@ -289,18 +267,49 @@ tfw_sched_ratio_calc(TfwRatioPool *rpool, TfwRatio *ratio, size_t *arg_mvidx) static void tfw_sched_ratio_calc_static(TfwRatioPool *rpool, TfwRatio *ratio) { - size_t si, max_val_idx = 0; + unsigned long sum_wgt; + unsigned int diff; + size_t si, max_val_idx, one_val_idx; TfwRatioSrvDesc *srvdesc = rpool->srvdesc; TfwRatioSrvData *srvdata = ratio->srvdata; - /* Collect server weights from the configuration. */ + /* + * Collect server weights from the configuration. Calculate the + * sum of server's weights in the group. Remember the index of + * server data entry with maximum weight. That same entry will + * also have the maximum ratio. See if all weights in the group + * are the same. + */ + sum_wgt = diff = max_val_idx = 0; for (si = 0; si < rpool->srv_n; ++si) { + unsigned int weight = srvdesc[si].srv->weight; srvdata[si].sdidx = si; - srvdata[si].weight = srvdesc[si].srv->weight; + srvdata[si].weight = weight; + srvdata[si].cratio = srvdata[si].oratio = 1; + if (srvdata[max_val_idx].weight < weight) + max_val_idx = si; + sum_wgt += weight; + diff |= (weight != srvdata[0].weight); + } + + /* + * If all server weights are the same, then there's no need to + * do anything else. Set up all ratios to 1 and be done with it. + */ + if (!diff) { + TfwRatioSchData *schdata = &ratio->schdata; + + /* Set up the common part of scheduler data. */ + schdata->csidx = 0; + schdata->riter = 1; + schdata->reidx = rpool->srv_n; + + schdata->crsum = schdata->orsum = rpool->srv_n; } - /* Calculate ratios based on server weights. */ - if (!tfw_sched_ratio_calc(rpool, ratio, &max_val_idx)) + /* Calculate ratios based on different weights of servers. */ + if (!tfw_sched_ratio_calc(rpool, ratio, sum_wgt, + max_val_idx, &one_val_idx)) return; /* Sort server data entries by ratio in descending order. */ @@ -314,30 +323,49 @@ tfw_sched_ratio_calc_static(TfwRatioPool *rpool, TfwRatio *ratio) * Latest dynamic data is provided by APM module and represent RTT values * for each server in a group. Ratios are calculated on those RTT values. * However that way the ratios do not represent the real weight of each - * server because a bigger RTT value mean that a server is less favorable - * and has a lesser, NOT bigger weight. + * server. A bigger RTT value leads to a bigger ratio, while in fact that + * server is less favorable and should have a lesser, NOT bigger weight. + * + * Based on ratios calculated from RTT values, the algorithm here adjusts + * that and assigns a correct ratio to each server in the group. + * + * 1. 
If the minimal calculated ratio is 1, then find entries that have + * ratio of 1, and set them up with the weight and ratio of an entry + * with maximum calculated ratio. Likewise, set up entries with the + * maximum calculated ratio with weight and ratio of an entry with + * ratio of 1. + * For example, this is after the calculation of ratios: + * sdidx: 1 2 3 4 5 6 7 8 9 10 + * ratio: 10 5 1 30 1 25 1 60 15 50 + * After this step the result will be: + * sdidx: 1 2 3 4 5 6 7 8 9 10 + * ratio: 10 5 60 30 60 25 60 1 15 50 * - * Based on ratios calculated from RTT values, the algorithm here assigns - * a correct ratio to each server in the group. - * 1. If the minimal ratio is 1, then fill the entries with minimal ratio - * with values from an entry with the maximum ratio. Fill the entries - * with maximum ratio with values from an entry with minimal ratio. * 2. Sort the resulting array by ratio in descending order as required - * by the scheduling algorithm. + * by the scheduling algorithm. The result will be as follows: + * sdidx: 7 5 3 10 4 6 9 1 2 8 + * ratio: 60 60 60 50 30 25 15 10 5 1 + * * 3. Select the part of the array that omits entries from step 1 if any. * Those are entries at the start and at the end of the array. Reverse * the sequence of server descriptor indices in that part of the array. - * The resulting pairing of servers to ratios is the target. + * The resulting pairing of servers to ratios is the target. Servers + * with a lesser RTT are assigned a larger ratio. Servers with a larger + * RTT are assigned a lesser ratio. The result will be as follows: + * sdidx: 7 5 3 2 1 9 6 4 10 8 + * ratio: 60 60 60 50 30 25 15 10 5 1 */ static void -__tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) +__tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio, + unsigned long sum_wgt, size_t max_val_idx) { - size_t si, max_val_idx = 0, left = 0, right = 0; - unsigned int max_ratio = 0, has_one_val = 0; + size_t si, one_val_idx, left, right; + unsigned int max_ratio, has_one_val; TfwRatioSrvData *srvdata = ratio->srvdata; /* Calculate ratios based on server RTT values. */ - if (!tfw_sched_ratio_calc(rpool, ratio, &max_val_idx)) + if (!tfw_sched_ratio_calc(rpool, ratio, sum_wgt, + max_val_idx, &one_val_idx)) return; /* @@ -346,15 +374,11 @@ __tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) * do actions described in step 1 in the function's description. * Adjust the sum of ratios that is changed in this procedure. */ - for (si = 0; si < rpool->srv_n; ++si) { - if (srvdata[si].oratio == 1) { - has_one_val = 1; - break; - } - } + has_one_val = (srvdata[one_val_idx].oratio == 1); + if (has_one_val) { unsigned long orsum = ratio->schdata.orsum; - TfwRatioSrvData sdent_one = srvdata[si]; + TfwRatioSrvData sdent_one = srvdata[one_val_idx]; TfwRatioSrvData sdent_max = srvdata[max_val_idx]; /* Save maximum ratio value for future use. */ @@ -409,16 +433,31 @@ __tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) } /** - * Fill scheduler's ratio entry with APM data for each server. + * Get specific server's data (RTT) from the APM module. + * + * While all stats values are returned by the APM, only one specific + * value is taken as the current RTT. That is the configured value, + * one of MIN, MAX, AVG, or a specific percentile. * * Return 0 if there is no new APM data. * Return a non-zero value otherwise. + * + * TODO: The following cases should be considered. + * 1. 
It's possible that the actual stats values calculated by the APM + * module did not change. However, the APM doesn't know of that and + * just reports that the values may have changed. It would be great + * to catch that and avoid the recalculation of ratios in some cases. + * 2. Depending on specific RTT value a small deviation from the previous + * value might be acceptable. That should not cause a recalculation + * of ratio. + * 3. A typical case is that only a handful of servers misbehave in + * a large group of servers. Is there a way to detect that and do + * a partial recalculation of ratios? */ -static int -tfw_sched_ratio_fill_apmdata(TfwRatioPool *rpool, TfwRatio *ratio) +static inline int +__tfw_sched_ratio_get_rtt(size_t si, TfwRatioPool *rpool, TfwRatio *ratio) { - size_t si; - unsigned int recalc = 0; + unsigned int recalc; unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; TfwPrcntlStats pstats = { .ith = tfw_pstats_ith, @@ -428,32 +467,12 @@ tfw_sched_ratio_fill_apmdata(TfwRatioPool *rpool, TfwRatio *ratio) TfwRatioSrvData *srvdata = ratio->srvdata; TfwRatioSrvDesc *srvdesc = rpool->srvdesc; - /* - * Collect server RTT values from APM module. See if APM may have - * provided new data, and a recalculation is required. Otherwise - * there's nothing to do. - * - * TODO: The following cases should be considered. - * 1. APM recalculates the stats on each request-response pair. - * It's quite possible that the actual stats values did not - * change. However, the APM doesn't know of that and reports - * that the values may have changed. It would be great to - * catch that and avoid the recalculation of ratios. - * 2. Depending on actual RTT values a small deviation from the - * previous value should be acceptable. It should not cause - * a recalculation of ratio. - * 3. Finally, a typical case is that only a handful of servers - * misbehave in a large group of servers. Is there a way to - * detect that and do a partial recalculation of ratios? - */ - for (si = 0; si < rpool->srv_n; ++si) { - pstats.seq = srvdesc[si].seq; - recalc |= tfw_apm_stats(srvdesc[si].srv->apmref, &pstats); - srvdesc[si].seq = pstats.seq; + pstats.seq = srvdesc[si].seq; + recalc = tfw_apm_stats(srvdesc[si].srv->apmref, &pstats); + srvdesc[si].seq = pstats.seq; - srvdata[si].sdidx = si; - srvdata[si].weight = pstats.val[rpool->psidx] ? : 1; - } + srvdata[si].sdidx = si; + srvdata[si].weight = pstats.val[rpool->psidx] ? : 1; return recalc; } @@ -465,17 +484,27 @@ tfw_sched_ratio_fill_apmdata(TfwRatioPool *rpool, TfwRatio *ratio) * * The function runs periodically on timer and provides the data that * is used by the ratio scheduler for outgoing requests. - * - * Return 0 if there are no new ratio values. - * Return a non-zero value if new ratio values were calculated. */ -static int +static void tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) { - if (!tfw_sched_ratio_fill_apmdata(rpool, ratio)) - return 0; - __tfw_sched_ratio_calc_dynamic(rpool, ratio); - return 1; + size_t si, max_val_idx = 0; + unsigned long sum_wgt = 0; + TfwRatioSrvData *srvdata = ratio->srvdata; + + /* + * Calculate the sum of server's weights in the group. Remember + * the index of server data entry with maximum weight. That same + * entry will also have the maximum ratio. 
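+	 * E.g., with RTT-derived weights {10, 40, 20} the loop below
+	 * leaves sum_wgt = 70 and max_val_idx = 1.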
+ */ + for (si = 0; si < rpool->srv_n; ++si) { + __tfw_sched_ratio_get_rtt(si, rpool, ratio); + if (srvdata[max_val_idx].weight < srvdata[si].weight) + max_val_idx = si; + sum_wgt += srvdata[si].weight; + } + + __tfw_sched_ratio_calc_dynamic(rpool, ratio, sum_wgt, max_val_idx); } /** @@ -488,80 +517,99 @@ tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) * predicted RTT values. * * A simple linear regression calculation on a sliding data window is - * used to predict future RTT values for each server. @y is an RTT value - * from APM, and @x is the current number of invocations of this timer - * function (every @intvl msecs). Essentially @x is a measure of time. + * used to predict future RTT values for each server. @rtt is an RTT + * value from APM, and @cnt is the current number of invocations of + * this timer function (every @intvl msecs). Essentially, @cnt is + * a measure of time. + * + * The POC (proof of concept) implementation of this algorithm can be + * found in t/unit/user_space/slr.cc. @cnt corresponds to @x in the POC, + * and @rtt corresponds to @y. * * The function runs periodically on timer and provides the data that * is used by the ratio scheduler for outgoing requests. - * - * Return 0 if there are no new ratio values. - * Return a non-zero value if new ratio values were calculated. */ -static int +static void tfw_sched_ratio_calc_predict(TfwRatioPool *rpool, TfwRatio *ratio) { TfwRatioHstData *hstdata = rpool->hstdata; TfwRatioSrvData *srvdata = ratio->srvdata; static const long MUL = 1000; - unsigned long x = hstdata->counter * MUL; - size_t si, sz, ni; + unsigned long cnt, rtt, ahead, sum_wgt; + size_t si, sz, ni, max_val_idx; - tfw_sched_ratio_fill_apmdata(rpool, ratio); - - ni = hstdata->counter % hstdata->past_sz; + ni = hstdata->counter % hstdata->slot_n; + cnt = hstdata->counter * MUL; + ahead = hstdata->counter + hstdata->ahead; + sum_wgt = max_val_idx = 0; for (si = 0; si < rpool->srv_n; ++si) { - unsigned long y = srvdata[si].weight * MUL; + long prediction; TfwRatioHstDesc *hd = &hstdata->past[si]; - if (unlikely(hstdata->counter < hstdata->past_sz)) { + __tfw_sched_ratio_get_rtt(si, rpool, ratio); + + rtt = srvdata[si].weight * MUL; + + /* + * The calculations are slightly different for the case + * in the beginning where there's insufficient data for + * a whole window into the historic data set. 
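+		 * While the window is filling, each average is extended
+		 * by one sample: avg' = (avg * n + x) / (n + 1). Once it
+		 * is full, the oldest slot's contribution is swapped for
+		 * the newest sample: avg' = avg - (x_old - x_new) / N.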
+ */ + if (unlikely(hstdata->counter < hstdata->slot_n)) { sz = ni + 1; - hd->x_avg = (hd->x_avg * ni + x) / sz; - hd->y_avg = (hd->y_avg * ni + y) / sz; - hd->xy_avg = (hd->xy_avg * ni + x * y) / sz; - hd->x_avg_y_avg = hd->x_avg * hd->y_avg; - hd->x_sq_avg = (hd->x_sq_avg * ni + x * x) / sz; - hd->x_avg_sq = hd->x_avg * hd->x_avg; + hd->cnt_avg = (hd->cnt_avg * ni + cnt) / sz; + hd->rtt_avg = (hd->rtt_avg * ni + rtt) / sz; + hd->cnt_rtt_avg = + (hd->cnt_rtt_avg * ni + cnt * rtt) / sz; + hd->cnt_avg_rtt_avg = hd->cnt_avg * hd->rtt_avg; + hd->cnt_sq_avg = + (hd->cnt_sq_avg * ni + cnt * cnt) / sz; + hd->cnt_avg_sq = hd->cnt_avg * hd->cnt_avg; } else { - unsigned long h_x = hd->hist[ni].x; - unsigned long h_y = hd->hist[ni].y; - sz = hstdata->past_sz; - hd->x_avg = hd->x_avg - (h_x - x) / sz; - hd->y_avg = hd->y_avg - (h_y - y) / sz; - hd->xy_avg = hd->xy_avg - (h_x * h_y - x * y) / sz; - hd->x_avg_y_avg = hd->x_avg * hd->y_avg; - hd->x_sq_avg = hd->x_sq_avg - (h_x * h_x - x * x) / sz; - hd->x_avg_sq = hd->x_avg * hd->x_avg; + unsigned long h_cnt = hd->hist[ni].cnt; + unsigned long h_rtt = hd->hist[ni].rtt; + sz = hstdata->slot_n; + hd->cnt_avg = hd->cnt_avg - (h_cnt - cnt) / sz; + hd->rtt_avg = hd->rtt_avg - (h_rtt - rtt) / sz; + hd->cnt_rtt_avg = hd->cnt_rtt_avg + - (h_cnt * h_rtt - cnt * rtt) / sz; + hd->cnt_avg_rtt_avg = hd->cnt_avg * hd->rtt_avg; + hd->cnt_sq_avg = hd->cnt_sq_avg + - (h_cnt * h_cnt - cnt * cnt) / sz; + hd->cnt_avg_sq = hd->cnt_avg * hd->cnt_avg; } - hd->hist[ni].x = x; - hd->hist[ni].y = y; + hd->hist[ni].cnt = cnt; + hd->hist[ni].rtt = rtt; - if (hd->x_sq_avg == hd->x_avg_sq) { - hd->a = 0; - hd->b = hd->x_avg ? hd->y_avg / hd->x_avg : 1; + if (hd->cnt_sq_avg == hd->cnt_avg_sq) { + hd->coeff_a = 0; + hd->coeff_b = hd->cnt_avg + ? hd->rtt_avg / hd->cnt_avg : 1; } else { - hd->b = (hd->xy_avg - hd->x_avg_y_avg) - / (hd->x_sq_avg - hd->x_avg_sq); - hd->a = (hd->y_avg - hd->b * hd->x_avg) / MUL; + hd->coeff_b = (hd->cnt_rtt_avg - hd->cnt_avg_rtt_avg) + / (hd->cnt_sq_avg - hd->cnt_avg_sq); + hd->coeff_a = (hd->rtt_avg - hd->coeff_b * hd->cnt_avg) + / MUL; } - } - x = hstdata->counter + hstdata->ahead; - for (si = 0; si < rpool->srv_n; ++si) { - TfwRatioHstDesc *hd = &hstdata->past[si]; - long prediction = hd->a + hd->b * x; + prediction = hd->coeff_a + hd->coeff_b * ahead; srvdata[si].weight = prediction <= 0 ? 1 : prediction; + + if (srvdata[max_val_idx].weight < srvdata[si].weight) + max_val_idx = si; + sum_wgt += srvdata[si].weight; } + ++hstdata->counter; - __tfw_sched_ratio_calc_dynamic(rpool, ratio); - return 1; + __tfw_sched_ratio_calc_dynamic(rpool, ratio, sum_wgt, max_val_idx); } /** * Get a free for use entry from the RCU pool. + * Note that @ratio->free is always either 1 or 0. 
*/ static TfwRatio * tfw_sched_ratio_rpool_get(TfwRatioPool *rpool) @@ -569,14 +617,9 @@ tfw_sched_ratio_rpool_get(TfwRatioPool *rpool) int si; TfwRatio *ratio = rpool->rpool; - for (si = 0; si <= nr_cpu_ids; ++si, ++ratio) { - smp_mb(); - if (atomic_read(&ratio->free)) { - atomic_set(&ratio->free, 0); - smp_mb__after_atomic(); + for (si = 0; si <= nr_cpu_ids; ++si, ++ratio) + if (atomic_cmpxchg(&ratio->free, 1, 0)) return ratio; - } - } return NULL; } @@ -588,7 +631,6 @@ static inline void __tfw_sched_ratio_rpool_put(TfwRatio *ratio) { atomic_set(&ratio->free, 1); - smp_mb__after_atomic(); } static void @@ -615,7 +657,7 @@ tfw_sched_ratio_rpool_put(struct rcu_head *rcup) */ static void tfw_sched_ratio_calc_tmfn(TfwSrvGroup *sg, - int (*calc_fn)(TfwRatioPool *, TfwRatio *)) + void (*calc_fn)(TfwRatioPool *, TfwRatio *)) { TfwRatioPool *rpool = sg->sched_data; TfwRatio *cratio, *nratio; @@ -633,13 +675,10 @@ tfw_sched_ratio_calc_tmfn(TfwSrvGroup *sg, } /* - * Calculate dynamic ratios. If there's nothing to do, then - * return the ratio entry back to the RCU pool. + * Calculate dynamic ratios. If there's nothing to do, + * then return the ratio entry back to the RCU pool. */ - if (!calc_fn(rpool, nratio)) { - __tfw_sched_ratio_rpool_put(nratio); - goto rearm; - } + calc_fn(rpool, nratio); /* * Substitute the current ratio entry with the new one for @@ -809,7 +848,7 @@ __sched_srv(TfwRatioSrvDesc *srvdesc, int skipnip, int *nipconn) for (ci = 0; ci < srvdesc->conn_n; ++ci) { unsigned long idxval = atomic64_inc_return(&srvdesc->counter); - TfwSrvConn *srv_conn = srvdesc->conns[idxval % srvdesc->conn_n]; + TfwSrvConn *srv_conn = srvdesc->conn[idxval % srvdesc->conn_n]; if (unlikely(tfw_srv_conn_restricted(srv_conn) || tfw_srv_conn_queue_full(srv_conn))) @@ -910,9 +949,15 @@ tfw_sched_ratio_sched_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) * One runs under full set of restrictions, and the other runs * under restrictions that are slightly relaxed. It's likely * that servers probed in these two passes are not the same. + * + * It doesn't make sense to do lots of attempts. If a suitable + * connection can not be found after multiple attempts, then + * something is wrong with one or more upstream servers in + * this group. Spinning in the loop here would just aggravate + * the issue on Tempesta's side. */ - attempts = rpool->srv_n * 2 + 1; - while (--attempts) { + attempts = rpool->srv_n; + while (attempts--) { srvdesc = tfw_sched_ratio_next_srv(rpool, ratio); if ((srv_conn = __sched_srv(srvdesc, skipnip, &nipconn))) { rcu_read_unlock(); @@ -943,22 +988,10 @@ tfw_sched_ratio_cleanup(TfwSrvGroup *sg) /* Free the data that is shared between pool entries. */ for (si = 0; si < sg->srv_n; ++si) - if (rpool->srvdesc[si].conns) - kfree(rpool->srvdesc[si].conns); - kfree(rpool->srvdesc); - - /* Free the data that is unique for each pool entry. */ - for (si = 0; si <= nr_cpu_ids; ++si) - if (rpool->rpool[si].srvdata) - kfree(rpool->rpool[si].srvdata); + kfree(rpool->srvdesc[si].conn); /* Free the data allocated for predictive scheduler. */ - if (rpool->hstdata) { - for (si = 0; si < sg->srv_n; ++si) - if (rpool->hstdata->past[si].hist) - kfree(rpool->hstdata->past[si].hist); - kfree(rpool->hstdata); - } + kfree(rpool->hstdata); kfree(rpool); sg->sched_data = NULL; @@ -996,33 +1029,6 @@ tfw_sched_ratio_del_grp(TfwSrvGroup *sg) tfw_sched_ratio_cleanup(sg); } -/** - * Validate the integrity of a group. 
- * - * Make sure that number of servers in the group, and the number - * of connections for each server match the recorded values. - */ -static int -tfw_sched_ratio_validate_grp(TfwSrvGroup *sg) -{ - size_t si = 0, ci; - TfwServer *srv; - TfwSrvConn *srv_conn; - - list_for_each_entry(srv, &sg->srv_list, list) { - ci = 0; - list_for_each_entry(srv_conn, &srv->conn_list, list) - ++ci; - if (ci > srv->conn_n) - return -EINVAL; - ++si; - } - if (si > sg->srv_n) - return -EINVAL; - - return 0; -} - /** * Add a server group to Ratio Scheduler. * @@ -1035,77 +1041,107 @@ tfw_sched_ratio_validate_grp(TfwSrvGroup *sg) static int tfw_sched_ratio_add_grp(TfwSrvGroup *sg) { - int ret = -ENOMEM; + int ret = -EINVAL; size_t size, si, ci; TfwServer *srv; - TfwSrvConn *srv_conn; - TfwRatio *ratio; TfwRatioPool *rpool; TfwRatioSrvDesc *srvdesc; + TfwRatioSrvData *srvdata; + TfwRatio *ratio, *ratio_end; void *sched_data = sg->sched_data; - sg->sched_data = NULL; - - if (tfw_sched_ratio_validate_grp(sg)) + if (unlikely(!sg->srv_n || list_empty(&sg->srv_list))) return -EINVAL; - /* Pool of TfwRatio{}. Initial place for Ratio Scheduler data. */ - size = sizeof(TfwRatioPool) + sizeof(TfwRatio) * (nr_cpu_ids + 1); + size = sizeof(TfwRatioPool) + + sizeof(TfwRatio) * (nr_cpu_ids + 1) + + sizeof(TfwRatioSrvDesc) * sg->srv_n + + sizeof(TfwRatioSrvData) * sg->srv_n * (nr_cpu_ids + 1); if (!(sg->sched_data = kzalloc(size, GFP_KERNEL))) return -ENOMEM; + + /* Pool of TfwRatio{}. Initial place for Ratio Scheduler data. */ rpool = sg->sched_data; rpool->rpool = sg->sched_data + sizeof(TfwRatioPool); + ratio_end = rpool->rpool + nr_cpu_ids + 1; /* Array for server descriptors. Shared between RCU pool entries. */ - size = sizeof(TfwRatioSrvDesc) * sg->srv_n; - if (!(rpool->srvdesc = kzalloc(size, GFP_KERNEL))) - goto cleanup; + rpool->srvdesc = (TfwRatioSrvDesc *)ratio_end; rpool->psidx = sg->flags & TFW_SG_F_PSTATS_IDX_MASK; rpool->srv_n = sg->srv_n; /* Set up each RCU pool entry with required arrays and data. */ - size = sizeof(TfwRatioSrvData) * sg->srv_n; - for (si = 0, ratio = rpool->rpool; si <= nr_cpu_ids; ++si, ++ratio) { - if (!(ratio->srvdata = kzalloc(size, GFP_KERNEL))) - goto cleanup; + srvdata = (TfwRatioSrvData *)(rpool->srvdesc + sg->srv_n); + for (ratio = rpool->rpool; ratio < ratio_end; ++ratio) { + ratio->srvdata = srvdata; spin_lock_init(&ratio->schdata.lock); atomic_set(&ratio->free, 1); + srvdata += sg->srv_n; } /* Initial setup of upstream server descriptors. */ + si = 0; srvdesc = rpool->srvdesc; list_for_each_entry(srv, &sg->srv_list, list) { + TfwSrvConn **conn, *srv_conn; + + if (unlikely((si++ == sg->srv_n) || !srv->conn_n + || list_empty(&srv->conn_list))) + goto cleanup; + size = sizeof(TfwSrvConn *) * srv->conn_n; - if (!(srvdesc->conns = kzalloc(size, GFP_KERNEL))) + if (!(srvdesc->conn = kzalloc(size, GFP_KERNEL))) { + ret = -ENOMEM; goto cleanup; + } + ci = 0; - list_for_each_entry(srv_conn, &srv->conn_list, list) - srvdesc->conns[ci++] = srv_conn; + conn = srvdesc->conn; + list_for_each_entry(srv_conn, &srv->conn_list, list) { + if (unlikely(ci++ == srv->conn_n)) + goto cleanup; + *conn++ = srv_conn; + } + if (unlikely(ci != srv->conn_n)) + goto cleanup; + srvdesc->conn_n = srv->conn_n; srvdesc->srv = srv; atomic64_set(&srvdesc->counter, 0); srv->sched_data = srvdesc; ++srvdesc; } + if (unlikely(si != sg->srv_n)) + goto cleanup; /* Set up the necessary workspace for predictive scheduler. 
*/ if (sg->flags & TFW_SG_F_SCHED_RATIO_PREDICT) { - TfwRatioHstData *hstdata; + size_t slot_n; + TfwRatioHstUnit *hunit; + TfwRatioHstData *hdata; + TfwRatioHstDesc *hpast, *hpast_end; TfwSchrefPredict *schref = sched_data; BUG_ON(!schref); + + slot_n = schref->past * schref->rate; size = sizeof(TfwRatioHstData) - + sizeof(TfwRatioHstDesc) * sg->srv_n; - if (!(rpool->hstdata = kzalloc(size, GFP_KERNEL))) + + sizeof(TfwRatioHstDesc) * sg->srv_n + + sizeof(TfwRatioHstUnit) * sg->srv_n * slot_n; + + if (!(rpool->hstdata = kzalloc(size, GFP_KERNEL))) { + ret = -ENOMEM; goto cleanup; - hstdata = rpool->hstdata; - hstdata->past = (TfwRatioHstDesc *)(hstdata + 1); - hstdata->past_sz = schref->past * schref->rate; - hstdata->ahead = schref->ahead * schref->rate; - size = sizeof(TfwRatioHstXY) * hstdata->past_sz; - for (si = 0; si < sg->srv_n; ++si) { - TfwRatioHstDesc *hd = &hstdata->past[si]; - if (!(hd->hist = kzalloc(size, GFP_KERNEL))) - goto cleanup; + } + hdata = rpool->hstdata; + hdata->past = (TfwRatioHstDesc *)(hdata + 1); + hdata->slot_n = slot_n; + hdata->ahead = schref->ahead * schref->rate; + + hpast_end = hdata->past + sg->srv_n; + hunit = (TfwRatioHstUnit *)hpast_end; + for (hpast = hdata->past; hpast < hpast_end; ++hpast) { + hpast->hist = hunit; + hunit += slot_n; } } diff --git a/tempesta_fw/server.c b/tempesta_fw/server.c index 4d4e8a119..5ef4938c5 100644 --- a/tempesta_fw/server.c +++ b/tempesta_fw/server.c @@ -222,6 +222,11 @@ tfw_sg_for_each_srv(int (*cb)(TfwServer *srv)) /** * Release all server groups with all servers. + * + * Note: The function is called at shutdown and in user context when + * it's guaranteed that all activity has stopped. Therefore the locks + * are not just not necessary, they can't be used as the code in user + * context may sleep. */ void tfw_sg_release_all(void) diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c index 2813e80c8..6ad8e6081 100644 --- a/tempesta_fw/sock_srv.c +++ b/tempesta_fw/sock_srv.c @@ -459,8 +459,10 @@ tfw_sock_srv_connect_srv(TfwServer *srv) * is locked, and spews lots of warnings. LOCKDEP doesn't know * that parallel execution can't happen with the same socket. 
*/ - return tfw_peer_for_each_conn(srv, srv_conn, list, - __tfw_sock_srv_connect_try_later_cb); + list_for_each_entry(srv_conn, &srv->conn_list, list) + tfw_sock_srv_connect_try_later(srv_conn); + + return 0; } /** diff --git a/tempesta_fw/t/unit/user_space/percentiles.c b/tempesta_fw/t/unit/user_space/percentiles.c index 278c72821..021695450 100644 --- a/tempesta_fw/t/unit/user_space/percentiles.c +++ b/tempesta_fw/t/unit/user_space/percentiles.c @@ -174,7 +174,7 @@ static void __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) { int i; - unsigned long tmp; + unsigned long cnt_full, cnt_half; --pc->order; pc->begin = pc->end - ((TFW_STAT_BCKTS - 1) << pc->order); @@ -190,14 +190,15 @@ __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) */ for (i = 1; i < TFW_STAT_BCKTS / 2; ++i) atomic_add(atomic_read(&rng->cnt[r][i]), &rng->cnt[r][0]); - tmp = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2]) / 2; - atomic_add(tmp, &rng->cnt[r][0]); - atomic_set(&rng->cnt[r][1], tmp); + cnt_full = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2]); + cnt_half = cnt_full / 2; + atomic_add(cnt_half, &rng->cnt[r][0]); + atomic_set(&rng->cnt[r][1], cnt_full - cnt_half); for (i = 1; i < TFW_STAT_BCKTS / 2; ++i) { - tmp = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2 + i]); - tmp /= 2; - atomic_set(&rng->cnt[r][i * 2], tmp); - atomic_set(&rng->cnt[r][i * 2 + 1], tmp); + cnt_full = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2 + i]); + cnt_half = cnt_full / 2; + atomic_set(&rng->cnt[r][i * 2], cnt_half); + atomic_set(&rng->cnt[r][i * 2 + 1], cnt_full - cnt_half); } } From f6e11be2e5499b37febfcd75bad6497e9dfff817 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Thu, 18 May 2017 12:32:13 +0300 Subject: [PATCH 30/37] Fix the implementation of predictive ratio algorithm. Eliminate integer promotion from signed to unsigned. 
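
In the sliding-window branch the history values were held in unsigned
variables, so a difference such as (h_cnt - cnt) was computed in
unsigned arithmetic. Whenever the stored value was smaller than the
new sample the subtraction wrapped around, and the subsequent division
by the window size turned a small negative correction into a huge
positive one, corrupting the running averages.

A standalone illustration of this class of bug, with made-up numbers
(not Tempesta code):

    #include <stdio.h>

    int
    main(void)
    {
            unsigned long h_cnt_u = 1000;   /* history value, unsigned (old code) */
            long h_cnt_s = 1000;            /* same value, signed (this fix) */
            long cnt = 5000, sz = 10;

            /* -4000 wraps to a huge positive value before the division. */
            printf("unsigned: %lu\n", (h_cnt_u - (unsigned long)cnt) / sz);
            /* Signed arithmetic keeps the intended -400. */
            printf("signed:   %ld\n", (h_cnt_s - cnt) / sz);
            return 0;
    }

Keeping both operands signed preserves the sign through the division.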
--- tempesta_fw/sched/tfw_sched_ratio.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 73b0aaeee..5233c92a1 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -532,11 +532,13 @@ tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) static void tfw_sched_ratio_calc_predict(TfwRatioPool *rpool, TfwRatio *ratio) { + static const long MUL = 1000; + int ni, sz; + size_t si, max_val_idx; + unsigned long sum_wgt; + long cnt, rtt, ahead, prediction; TfwRatioHstData *hstdata = rpool->hstdata; TfwRatioSrvData *srvdata = ratio->srvdata; - static const long MUL = 1000; - unsigned long cnt, rtt, ahead, sum_wgt; - size_t si, sz, ni, max_val_idx; ni = hstdata->counter % hstdata->slot_n; cnt = hstdata->counter * MUL; @@ -544,7 +546,6 @@ tfw_sched_ratio_calc_predict(TfwRatioPool *rpool, TfwRatio *ratio) sum_wgt = max_val_idx = 0; for (si = 0; si < rpool->srv_n; ++si) { - long prediction; TfwRatioHstDesc *hd = &hstdata->past[si]; __tfw_sched_ratio_get_rtt(si, rpool, ratio); @@ -567,8 +568,8 @@ tfw_sched_ratio_calc_predict(TfwRatioPool *rpool, TfwRatio *ratio) (hd->cnt_sq_avg * ni + cnt * cnt) / sz; hd->cnt_avg_sq = hd->cnt_avg * hd->cnt_avg; } else { - unsigned long h_cnt = hd->hist[ni].cnt; - unsigned long h_rtt = hd->hist[ni].rtt; + long h_cnt = hd->hist[ni].cnt; + long h_rtt = hd->hist[ni].rtt; sz = hstdata->slot_n; hd->cnt_avg = hd->cnt_avg - (h_cnt - cnt) / sz; hd->rtt_avg = hd->rtt_avg - (h_rtt - rtt) / sz; From 1850314e6824ff86d08e5a3d18e055e84e0875ce Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Thu, 18 May 2017 13:33:56 +0300 Subject: [PATCH 31/37] Replace ratio->free with ratio->busy for better clarity. --- tempesta_fw/sched/tfw_sched_ratio.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 5233c92a1..5f1f0f64e 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -150,13 +150,13 @@ typedef struct { * the whole run-time. That may change in the future. * * @rcu - RCU control structure; - * @free - indicates that the pool entry is available for use. + * @busy - indicates that the pool entry is currently used. * @srvdata - scheduler data specific to each server in the group. * @schdata - scheduler data common to all servers in the group. */ typedef struct { struct rcu_head rcu; - atomic_t free; + atomic_t busy; TfwRatioSrvData *srvdata; TfwRatioSchData schdata; } TfwRatio; @@ -610,7 +610,7 @@ tfw_sched_ratio_calc_predict(TfwRatioPool *rpool, TfwRatio *ratio) /** * Get a free for use entry from the RCU pool. - * Note that @ratio->free is always either 1 or 0. + * Note that @ratio->busy is always either 1 or 0. */ static TfwRatio * tfw_sched_ratio_rpool_get(TfwRatioPool *rpool) @@ -619,7 +619,7 @@ tfw_sched_ratio_rpool_get(TfwRatioPool *rpool) TfwRatio *ratio = rpool->rpool; for (si = 0; si <= nr_cpu_ids; ++si, ++ratio) - if (atomic_cmpxchg(&ratio->free, 1, 0)) + if (!atomic_cmpxchg(&ratio->busy, 0, 1)) return ratio; return NULL; @@ -628,17 +628,11 @@ tfw_sched_ratio_rpool_get(TfwRatioPool *rpool) /** * Return an entry to the RCU pool. 
*/ -static inline void -__tfw_sched_ratio_rpool_put(TfwRatio *ratio) -{ - atomic_set(&ratio->free, 1); -} - static void tfw_sched_ratio_rpool_put(struct rcu_head *rcup) { TfwRatio *ratio = container_of(rcup, TfwRatio, rcu); - __tfw_sched_ratio_rpool_put(ratio); + atomic_set(&ratio->busy, 0); } /** @@ -1066,17 +1060,17 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg) rpool->rpool = sg->sched_data + sizeof(TfwRatioPool); ratio_end = rpool->rpool + nr_cpu_ids + 1; - /* Array for server descriptors. Shared between RCU pool entries. */ - rpool->srvdesc = (TfwRatioSrvDesc *)ratio_end; - rpool->psidx = sg->flags & TFW_SG_F_PSTATS_IDX_MASK; rpool->srv_n = sg->srv_n; + rpool->psidx = sg->flags & TFW_SG_F_PSTATS_IDX_MASK; + + /* Array of server descriptors. Shared between RCU pool entries. */ + rpool->srvdesc = (TfwRatioSrvDesc *)ratio_end; /* Set up each RCU pool entry with required arrays and data. */ srvdata = (TfwRatioSrvData *)(rpool->srvdesc + sg->srv_n); for (ratio = rpool->rpool; ratio < ratio_end; ++ratio) { - ratio->srvdata = srvdata; spin_lock_init(&ratio->schdata.lock); - atomic_set(&ratio->free, 1); + ratio->srvdata = srvdata; srvdata += sg->srv_n; } From 42140b3684d3988085a56279fdcfbcfe7654a8e2 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Sun, 28 May 2017 23:10:49 +0300 Subject: [PATCH 32/37] Code rework to address comments in recent code review. --- tempesta_fw/sched/tfw_sched_ratio.c | 465 +++++++++++++++------------- tempesta_fw/server.h | 5 +- tempesta_fw/sock_srv.c | 9 +- 3 files changed, 254 insertions(+), 225 deletions(-) diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c index 5f1f0f64e..770b678dc 100644 --- a/tempesta_fw/sched/tfw_sched_ratio.c +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -54,7 +54,7 @@ typedef struct { } TfwRatioSrvDesc; /** - * Server data for scheduler. + * Individual server data for scheduler. * * @sdidx - index of server descriptor this data is for. * @weight - server weight. @@ -134,57 +134,53 @@ typedef struct { * @ahead - predict for this number of @intvl ahead. * @slot_n - total number of slots for past data. * @counter - slot that is available for storing past data. - * @past - past data for each server (@past[@srv_n]). + * @hstdesc - past data for each server (@hstdesc[@srv_n]). */ typedef struct { unsigned int ahead; size_t slot_n; unsigned long counter; - TfwRatioHstDesc *past; + TfwRatioHstDesc *hstdesc; } TfwRatioHstData; /** - * The main Ratio Scheduler structure. + * The main Ratio Scheduler data structure. * * All servers, either dead or live, are present in the list during * the whole run-time. That may change in the future. * - * @rcu - RCU control structure; - * @busy - indicates that the pool entry is currently used. + * @rcu - RCU control structure. * @srvdata - scheduler data specific to each server in the group. * @schdata - scheduler data common to all servers in the group. */ typedef struct { struct rcu_head rcu; - atomic_t busy; TfwRatioSrvData *srvdata; TfwRatioSchData schdata; -} TfwRatio; +} TfwRatioData; /** - * The pool of TfwRatio{} structures for RCU. + * The main structure for the group. * * @srv_n - number of upstream servers. * @psidx - APM pstats[] value index for dynamic ratios. * @intvl - interval for re-arming the timer. - * @rpool - pool of TfwRatio{} for RCU. - * @ratio - pointer to the currently used structure. - * @hstdata - historic data for predictive scheduler. - * @srvdesc - array of upstream server descriptors. * @rearm - indicates if the timer can be re-armed. 
* @timer - periodic timer for dynamic APM data. + * @hstdata - historic data for predictive scheduler. + * @srvdesc - array of upstream server descriptors. + * @rtodata - pointer to the currently used scheduler data. */ typedef struct { size_t srv_n; size_t psidx; unsigned int intvl; - TfwRatio *rpool; - TfwRatio __rcu *ratio; - TfwRatioHstData *hstdata; - TfwRatioSrvDesc *srvdesc; atomic_t rearm; struct timer_list timer; -} TfwRatioPool; + TfwRatioHstData *hstdata; + TfwRatioSrvDesc *srvdesc; + TfwRatioData __rcu *rtodata; +} TfwRatio; /** * Swap two server data entries. Required for sorting by sort(). @@ -219,20 +215,20 @@ tfw_sched_ratio_srvdata_cmp(const void *lhs, const void *rhs) * Return a non-zero value if additional actions are needed. */ static int -tfw_sched_ratio_calc(TfwRatioPool *rpool, TfwRatio *ratio, +tfw_sched_ratio_calc(TfwRatio *ratio, TfwRatioData *rtodata, unsigned long sum_wgt, size_t max_val_idx, size_t *arg_ovidx) { size_t si, one_val_idx; unsigned int diff, max_wgt, oratio; unsigned long unit, sum_ratio = 0; - TfwRatioSrvData *srvdata = ratio->srvdata; - TfwRatioSchData *schdata = &ratio->schdata; + TfwRatioSrvData *srvdata = rtodata->srvdata; + TfwRatioSchData *schdata = &rtodata->schdata; /* Set up the common part of scheduler data. */ schdata->csidx = 0; schdata->riter = 1; - schdata->reidx = rpool->srv_n; + schdata->reidx = ratio->srv_n; /* * Calculate each server's ratio using the following formula: @@ -243,8 +239,8 @@ tfw_sched_ratio_calc(TfwRatioPool *rpool, TfwRatio *ratio, */ diff = one_val_idx = 0; max_wgt = srvdata[max_val_idx].weight; - unit = ((max_wgt + rpool->srv_n) * max_wgt) / sum_wgt; - for (si = 0; si < rpool->srv_n; ++si) { + unit = ((max_wgt + ratio->srv_n) * max_wgt) / sum_wgt; + for (si = 0; si < ratio->srv_n; ++si) { oratio = (unit * srvdata[si].weight) / max_wgt ? : 1; srvdata[si].cratio = srvdata[si].oratio = oratio; diff |= (oratio != srvdata[0].oratio); @@ -265,13 +261,13 @@ tfw_sched_ratio_calc(TfwRatioPool *rpool, TfwRatio *ratio, * weights that are statically defined in the configuration file. */ static void -tfw_sched_ratio_calc_static(TfwRatioPool *rpool, TfwRatio *ratio) +tfw_sched_ratio_calc_static(TfwRatio *ratio, TfwRatioData *rtodata) { unsigned long sum_wgt; unsigned int diff; size_t si, max_val_idx, one_val_idx; - TfwRatioSrvDesc *srvdesc = rpool->srvdesc; - TfwRatioSrvData *srvdata = ratio->srvdata; + TfwRatioSrvDesc *srvdesc = ratio->srvdesc; + TfwRatioSrvData *srvdata = rtodata->srvdata; /* * Collect server weights from the configuration. Calculate the @@ -281,7 +277,7 @@ tfw_sched_ratio_calc_static(TfwRatioPool *rpool, TfwRatio *ratio) * are the same. */ sum_wgt = diff = max_val_idx = 0; - for (si = 0; si < rpool->srv_n; ++si) { + for (si = 0; si < ratio->srv_n; ++si) { unsigned int weight = srvdesc[si].srv->weight; srvdata[si].sdidx = si; srvdata[si].weight = weight; @@ -297,23 +293,23 @@ tfw_sched_ratio_calc_static(TfwRatioPool *rpool, TfwRatio *ratio) * do anything else. Set up all ratios to 1 and be done with it. */ if (!diff) { - TfwRatioSchData *schdata = &ratio->schdata; + TfwRatioSchData *schdata = &rtodata->schdata; /* Set up the common part of scheduler data. */ schdata->csidx = 0; schdata->riter = 1; - schdata->reidx = rpool->srv_n; + schdata->reidx = ratio->srv_n; - schdata->crsum = schdata->orsum = rpool->srv_n; + schdata->crsum = schdata->orsum = ratio->srv_n; } /* Calculate ratios based on different weights of servers. 
*/ - if (!tfw_sched_ratio_calc(rpool, ratio, sum_wgt, + if (!tfw_sched_ratio_calc(ratio, rtodata, sum_wgt, max_val_idx, &one_val_idx)) return; /* Sort server data entries by ratio in descending order. */ - sort(srvdata, rpool->srv_n, sizeof(srvdata[0]), + sort(srvdata, ratio->srv_n, sizeof(srvdata[0]), tfw_sched_ratio_srvdata_cmp, tfw_sched_ratio_srvdata_swap); } @@ -356,15 +352,15 @@ tfw_sched_ratio_calc_static(TfwRatioPool *rpool, TfwRatio *ratio) * ratio: 60 60 60 50 30 25 15 10 5 1 */ static void -__tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio, +__tfw_sched_ratio_calc_dynamic(TfwRatio *ratio, TfwRatioData *rtodata, unsigned long sum_wgt, size_t max_val_idx) { size_t si, one_val_idx, left, right; unsigned int max_ratio, has_one_val; - TfwRatioSrvData *srvdata = ratio->srvdata; + TfwRatioSrvData *srvdata = rtodata->srvdata; /* Calculate ratios based on server RTT values. */ - if (!tfw_sched_ratio_calc(rpool, ratio, sum_wgt, + if (!tfw_sched_ratio_calc(ratio, rtodata, sum_wgt, max_val_idx, &one_val_idx)) return; @@ -377,14 +373,14 @@ __tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio, has_one_val = (srvdata[one_val_idx].oratio == 1); if (has_one_val) { - unsigned long orsum = ratio->schdata.orsum; + unsigned long orsum = rtodata->schdata.orsum; TfwRatioSrvData sdent_one = srvdata[one_val_idx]; TfwRatioSrvData sdent_max = srvdata[max_val_idx]; /* Save maximum ratio value for future use. */ max_ratio = srvdata[max_val_idx].oratio; - for (si = 0; si < rpool->srv_n; ++si) { + for (si = 0; si < ratio->srv_n; ++si) { if (srvdata[si].oratio == 1) { srvdata[si].weight = sdent_max.weight; srvdata[si].oratio = @@ -397,11 +393,11 @@ __tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio, orsum -= sdent_max.oratio - 1; } } - ratio->schdata.crsum = ratio->schdata.orsum = orsum; + rtodata->schdata.crsum = rtodata->schdata.orsum = orsum; } /* Sort server data entries by ratio in descending order. */ - sort(srvdata, rpool->srv_n, sizeof(srvdata[0]), + sort(srvdata, ratio->srv_n, sizeof(srvdata[0]), tfw_sched_ratio_srvdata_cmp, tfw_sched_ratio_srvdata_swap); /* @@ -413,9 +409,9 @@ __tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio, */ if (!has_one_val) { left = 0; - right = rpool->srv_n - 1; + right = ratio->srv_n - 1; } else { - for (si = 0; si < rpool->srv_n; ++si) + for (si = 0; si < ratio->srv_n; ++si) if (srvdata[si].oratio == max_ratio) { left = si + 1; } else if (srvdata[si].oratio == 1) { @@ -455,7 +451,7 @@ __tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio, * a partial recalculation of ratios? */ static inline int -__tfw_sched_ratio_get_rtt(size_t si, TfwRatioPool *rpool, TfwRatio *ratio) +__tfw_sched_ratio_get_rtt(size_t si, TfwRatio *ratio, TfwRatioData *rtodata) { unsigned int recalc; unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; @@ -464,15 +460,15 @@ __tfw_sched_ratio_get_rtt(size_t si, TfwRatioPool *rpool, TfwRatio *ratio) .val = val, .psz = ARRAY_SIZE(tfw_pstats_ith) }; - TfwRatioSrvData *srvdata = ratio->srvdata; - TfwRatioSrvDesc *srvdesc = rpool->srvdesc; + TfwRatioSrvData *srvdata = rtodata->srvdata; + TfwRatioSrvDesc *srvdesc = ratio->srvdesc; pstats.seq = srvdesc[si].seq; recalc = tfw_apm_stats(srvdesc[si].srv->apmref, &pstats); srvdesc[si].seq = pstats.seq; srvdata[si].sdidx = si; - srvdata[si].weight = pstats.val[rpool->psidx] ? : 1; + srvdata[si].weight = pstats.val[ratio->psidx] ? 
: 1; return recalc; } @@ -482,29 +478,29 @@ __tfw_sched_ratio_get_rtt(size_t si, TfwRatioPool *rpool, TfwRatio *ratio) * Latest dynamic data is provided by APM module and represent RTT values * for each server in a group. Ratios are calculated on those RTT values. * - * The function runs periodically on timer and provides the data that - * is used by the ratio scheduler for outgoing requests. + * The function runs periodically on timer and provides the data that is + * used by the ratio scheduler for outgoing requests. */ static void -tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) +tfw_sched_ratio_calc_dynamic(TfwRatio *ratio, TfwRatioData *rtodata) { size_t si, max_val_idx = 0; unsigned long sum_wgt = 0; - TfwRatioSrvData *srvdata = ratio->srvdata; + TfwRatioSrvData *srvdata = rtodata->srvdata; /* * Calculate the sum of server's weights in the group. Remember * the index of server data entry with maximum weight. That same * entry will also have the maximum ratio. */ - for (si = 0; si < rpool->srv_n; ++si) { - __tfw_sched_ratio_get_rtt(si, rpool, ratio); + for (si = 0; si < ratio->srv_n; ++si) { + __tfw_sched_ratio_get_rtt(si, ratio, rtodata); if (srvdata[max_val_idx].weight < srvdata[si].weight) max_val_idx = si; sum_wgt += srvdata[si].weight; } - __tfw_sched_ratio_calc_dynamic(rpool, ratio, sum_wgt, max_val_idx); + __tfw_sched_ratio_calc_dynamic(ratio, rtodata, sum_wgt, max_val_idx); } /** @@ -530,25 +526,25 @@ tfw_sched_ratio_calc_dynamic(TfwRatioPool *rpool, TfwRatio *ratio) * is used by the ratio scheduler for outgoing requests. */ static void -tfw_sched_ratio_calc_predict(TfwRatioPool *rpool, TfwRatio *ratio) +tfw_sched_ratio_calc_predict(TfwRatio *ratio, TfwRatioData *rtodata) { static const long MUL = 1000; int ni, sz; size_t si, max_val_idx; unsigned long sum_wgt; long cnt, rtt, ahead, prediction; - TfwRatioHstData *hstdata = rpool->hstdata; - TfwRatioSrvData *srvdata = ratio->srvdata; + TfwRatioHstData *hstdata = ratio->hstdata; + TfwRatioSrvData *srvdata = rtodata->srvdata; ni = hstdata->counter % hstdata->slot_n; cnt = hstdata->counter * MUL; ahead = hstdata->counter + hstdata->ahead; sum_wgt = max_val_idx = 0; - for (si = 0; si < rpool->srv_n; ++si) { - TfwRatioHstDesc *hd = &hstdata->past[si]; + for (si = 0; si < ratio->srv_n; ++si) { + TfwRatioHstDesc *hd = &hstdata->hstdesc[si]; - __tfw_sched_ratio_get_rtt(si, rpool, ratio); + __tfw_sched_ratio_get_rtt(si, ratio, rtodata); rtt = srvdata[si].weight * MUL; @@ -605,89 +601,80 @@ tfw_sched_ratio_calc_predict(TfwRatioPool *rpool, TfwRatio *ratio) ++hstdata->counter; - __tfw_sched_ratio_calc_dynamic(rpool, ratio, sum_wgt, max_val_idx); + __tfw_sched_ratio_calc_dynamic(ratio, rtodata, sum_wgt, max_val_idx); } /** - * Get a free for use entry from the RCU pool. - * Note that @ratio->busy is always either 1 or 0. + * Get and set up a new ratio data entry. */ -static TfwRatio * -tfw_sched_ratio_rpool_get(TfwRatioPool *rpool) +static TfwRatioData * +tfw_sched_ratio_rtodata_get(TfwRatio *ratio) { - int si; - TfwRatio *ratio = rpool->rpool; + size_t size; + TfwRatioData *rtodata; - for (si = 0; si <= nr_cpu_ids; ++si, ++ratio) - if (!atomic_cmpxchg(&ratio->busy, 0, 1)) - return ratio; + size = sizeof(TfwRatioData) + sizeof(TfwRatioSrvData) * ratio->srv_n; + if (!(rtodata = kmalloc(size, GFP_ATOMIC))) + return NULL; + rtodata->srvdata = (TfwRatioSrvData *)(rtodata + 1); + spin_lock_init(&rtodata->schdata.lock); - return NULL; + return rtodata; } /** - * Return an entry to the RCU pool. 
+ * Release a ratio data entry that is no longer used. */ static void -tfw_sched_ratio_rpool_put(struct rcu_head *rcup) +tfw_sched_ratio_rtodata_put(struct rcu_head *rcup) { - TfwRatio *ratio = container_of(rcup, TfwRatio, rcu); - atomic_set(&ratio->busy, 0); + TfwRatioData *rtodata = container_of(rcup, TfwRatioData, rcu); + kfree(rtodata); } /** * Calculate the latest ratios for each server in the group in real time. * * RCU is used to avoid locks. When recalculation is in order, the new - * data is placed in an available entry from the RCU pool. The new entry - * then is seamlessly set as the current entry. The formerly active entry - * is returned to the RCU pool when all users of it are done and gone. - * - * It may happen that no RCU pool entry is available at the moment. - * That's not a big deal. Scheduling of upstream servers will continue - * to run on currently active data. The timer is scheduled to run ASAP - * and catch an RCU pool entry the moment it gets available. - * To make this case less probable, the number of RCU pool entries - * is chosen as one more than the number of CPU slots in the system. + * data is placed in a new allocated entry. The new entry is seamlessly + * set as the current entry by using RCU. The formerly active entry is + * released in due time when all users of it are done and gone. */ static void tfw_sched_ratio_calc_tmfn(TfwSrvGroup *sg, - void (*calc_fn)(TfwRatioPool *, TfwRatio *)) + void (*calc_fn)(TfwRatio *, TfwRatioData *)) { - TfwRatioPool *rpool = sg->sched_data; - TfwRatio *cratio, *nratio; - int interval = rpool->intvl; + TfwRatio *ratio = sg->sched_data; + TfwRatioData *crtodata, *nrtodata; /* - * Get an available ratio entry from the RCU pool. If there's - * none at the moment, then try it again in a short while on - * the next run of timer function. + * Get a new ratio data entry. Usually, if unsuccessful, that's + * not a big deal. Scheduling of upstream servers will continue + * to run on currently active data. However, the lack of memory + * is a critical issue in itself. */ - nratio = tfw_sched_ratio_rpool_get(rpool); - if (unlikely(!nratio)) { - interval = 1; + if (!(nrtodata = tfw_sched_ratio_rtodata_get(ratio))) { + TFW_ERR("Sched ratio: Insufficient memory for group '%s'\n", + sg->name); goto rearm; } - /* - * Calculate dynamic ratios. If there's nothing to do, - * then return the ratio entry back to the RCU pool. - */ - calc_fn(rpool, nratio); + /* Calculate dynamic ratios. */ + calc_fn(ratio, nrtodata); /* - * Substitute the current ratio entry with the new one for - * scheduler. The former entry will be returned to the RCU - * pool when there are no users of it. + * Substitute the current ratio data entry with the new one for + * the scheduler. The former entry will be released when there + * are no users of it. Use the faster non-lazy RCU. */ - cratio = rpool->ratio; - rcu_assign_pointer(rpool->ratio, nratio); - call_rcu(&cratio->rcu, tfw_sched_ratio_rpool_put); + crtodata = ratio->rtodata; + rcu_assign_pointer(ratio->rtodata, nrtodata); + call_rcu(&crtodata->rcu, tfw_sched_ratio_rtodata_put); rearm: smp_mb(); - if (atomic_read(&rpool->rearm)) - mod_timer(&rpool->timer, jiffies + interval); + if (atomic_read(&ratio->rearm)) + mod_timer(&ratio->timer, jiffies + ratio->intvl); } /** @@ -730,20 +717,20 @@ tfw_sched_ratio_predict_tmfn(unsigned long tmfn_data) * TODO: The algorithm may and should be improved. 
*/ static inline bool -tfw_sched_ratio_is_srv_turn(TfwRatioPool *rpool, TfwRatio *ratio, size_t csidx) +tfw_sched_ratio_is_srv_turn(TfwRatio *ratio, TfwRatioData *rtodata, size_t csidx) { unsigned long headsum2, tailsum2; - TfwRatioSrvData *srvdata = ratio->srvdata; - TfwRatioSchData *schdata = &ratio->schdata; + TfwRatioSrvData *srvdata = rtodata->srvdata; + TfwRatioSchData *schdata = &rtodata->schdata; if (!csidx) return true; headsum2 = (srvdata[0].cratio + srvdata[csidx - 1].cratio) * csidx; tailsum2 = (srvdata[csidx].cratio - + (srvdata[rpool->srv_n - 1].cratio - ? : srvdata[rpool->srv_n - 1].oratio)) - * (rpool->srv_n - csidx); + + (srvdata[ratio->srv_n - 1].cratio + ? : srvdata[ratio->srv_n - 1].oratio)) + * (ratio->srv_n - csidx); return tailsum2 * schdata->riter > headsum2; } @@ -761,11 +748,11 @@ tfw_sched_ratio_is_srv_turn(TfwRatioPool *rpool, TfwRatio *ratio, size_t csidx) * that it won't give any advantage. */ static TfwRatioSrvDesc * -tfw_sched_ratio_next_srv(TfwRatioPool *rpool, TfwRatio *ratio) +tfw_sched_ratio_next_srv(TfwRatio *ratio, TfwRatioData *rtodata) { size_t csidx; - TfwRatioSrvData *srvdata = ratio->srvdata; - TfwRatioSchData *schdata = &ratio->schdata; + TfwRatioSrvData *srvdata = rtodata->srvdata; + TfwRatioSchData *schdata = &rtodata->schdata; /* Start with server that has the highest ratio. */ spin_lock(&schdata->lock); @@ -783,7 +770,7 @@ tfw_sched_ratio_next_srv(TfwRatioPool *rpool, TfwRatio *ratio) */ if (schdata->reidx != csidx) { ++schdata->csidx; - if (schdata->csidx == rpool->srv_n) { + if (schdata->csidx == ratio->srv_n) { schdata->csidx = 0; schdata->riter = 1; } @@ -802,20 +789,20 @@ tfw_sched_ratio_next_srv(TfwRatioPool *rpool, TfwRatio *ratio) * the group, then also start from the beginning, but do not * reset as it's been reset already (make sure of that). */ - if (likely(tfw_sched_ratio_is_srv_turn(rpool, ratio, csidx))) { + if (likely(tfw_sched_ratio_is_srv_turn(ratio, rtodata, csidx))) { --srvdata[csidx].cratio; if (unlikely(!--schdata->crsum)) { schdata->csidx = 0; schdata->riter = 1; schdata->crsum = schdata->orsum; schdata->reidx = 0; - } else if (unlikely(++schdata->csidx == rpool->srv_n)) { - BUG_ON(schdata->reidx != rpool->srv_n); + } else if (unlikely(++schdata->csidx == ratio->srv_n)) { + BUG_ON(schdata->reidx != ratio->srv_n); schdata->csidx = 0; schdata->riter = 1; } spin_unlock(&schdata->lock); - return rpool->srvdesc + srvdata[csidx].sdidx; + return ratio->srvdesc + srvdata[csidx].sdidx; } /* * This is not the turn of the current server. Start @@ -912,16 +899,16 @@ static TfwSrvConn * tfw_sched_ratio_sched_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) { unsigned int attempts, skipnip = 1, nipconn = 0; - TfwRatioPool *rpool = sg->sched_data; + TfwRatio *ratio = sg->sched_data; TfwRatioSrvDesc *srvdesc; TfwSrvConn *srv_conn; - TfwRatio *ratio; + TfwRatioData *rtodata; - BUG_ON(!rpool); + BUG_ON(!ratio); rcu_read_lock(); - ratio = rcu_dereference(rpool->ratio); - BUG_ON(!ratio); + rtodata = rcu_dereference(ratio->rtodata); + BUG_ON(!rtodata); rerun: /* * Try servers in a group according to their ratios. Attempt to @@ -951,9 +938,9 @@ tfw_sched_ratio_sched_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) * this group. Spinning in the loop here would just aggravate * the issue on Tempesta's side. 
*/ - attempts = rpool->srv_n; + attempts = ratio->srv_n; while (attempts--) { - srvdesc = tfw_sched_ratio_next_srv(rpool, ratio); + srvdesc = tfw_sched_ratio_next_srv(ratio, rtodata); if ((srv_conn = __sched_srv(srvdesc, skipnip, &nipconn))) { rcu_read_unlock(); return srv_conn; @@ -976,19 +963,19 @@ static void tfw_sched_ratio_cleanup(TfwSrvGroup *sg) { size_t si; - TfwRatioPool *rpool = sg->sched_data; + TfwRatio *ratio = sg->sched_data; - if (!rpool) + if (!ratio) return; - /* Free the data that is shared between pool entries. */ + /* Data that is shared between pool entries. */ for (si = 0; si < sg->srv_n; ++si) - kfree(rpool->srvdesc[si].conn); + kfree(ratio->srvdesc[si].conn); - /* Free the data allocated for predictive scheduler. */ - kfree(rpool->hstdata); + kfree(ratio->hstdata); + kfree(ratio->rtodata); - kfree(rpool); + kfree(ratio); sg->sched_data = NULL; } @@ -1003,7 +990,7 @@ tfw_sched_ratio_cleanup(TfwSrvGroup *sg) static void tfw_sched_ratio_del_grp(TfwSrvGroup *sg) { - TfwRatioPool *rpool = sg->sched_data; + TfwRatio *ratio = sg->sched_data; /* * Make sure the timer doesn't re-arms itself. This @@ -1012,9 +999,9 @@ tfw_sched_ratio_del_grp(TfwSrvGroup *sg) if (sg->flags & (TFW_SG_F_SCHED_RATIO_DYNAMIC | TFW_SG_F_SCHED_RATIO_PREDICT)) { - atomic_set(&rpool->rearm, 0); + atomic_set(&ratio->rearm, 0); smp_mb__after_atomic(); - del_timer_sync(&rpool->timer); + del_timer_sync(&ratio->timer); } /* Wait for outstanding RCU callbacks to complete. */ @@ -1033,72 +1020,36 @@ tfw_sched_ratio_del_grp(TfwSrvGroup *sg) * Additional configuration data required for Predictive scheduler are * passed via @sg->sched_data. */ + +/* Set up the upstream server descriptors. */ static int -tfw_sched_ratio_add_grp(TfwSrvGroup *sg) +tfw_sched_ratio_srvdesc_setup(TfwSrvGroup *sg) { - int ret = -EINVAL; - size_t size, si, ci; + size_t size, si = 0, ci; TfwServer *srv; - TfwRatioPool *rpool; - TfwRatioSrvDesc *srvdesc; - TfwRatioSrvData *srvdata; - TfwRatio *ratio, *ratio_end; - void *sched_data = sg->sched_data; - - if (unlikely(!sg->srv_n || list_empty(&sg->srv_list))) - return -EINVAL; - - size = sizeof(TfwRatioPool) - + sizeof(TfwRatio) * (nr_cpu_ids + 1) - + sizeof(TfwRatioSrvDesc) * sg->srv_n - + sizeof(TfwRatioSrvData) * sg->srv_n * (nr_cpu_ids + 1); - if (!(sg->sched_data = kzalloc(size, GFP_KERNEL))) - return -ENOMEM; - - /* Pool of TfwRatio{}. Initial place for Ratio Scheduler data. */ - rpool = sg->sched_data; - rpool->rpool = sg->sched_data + sizeof(TfwRatioPool); - ratio_end = rpool->rpool + nr_cpu_ids + 1; - - rpool->srv_n = sg->srv_n; - rpool->psidx = sg->flags & TFW_SG_F_PSTATS_IDX_MASK; - - /* Array of server descriptors. Shared between RCU pool entries. */ - rpool->srvdesc = (TfwRatioSrvDesc *)ratio_end; - - /* Set up each RCU pool entry with required arrays and data. */ - srvdata = (TfwRatioSrvData *)(rpool->srvdesc + sg->srv_n); - for (ratio = rpool->rpool; ratio < ratio_end; ++ratio) { - spin_lock_init(&ratio->schdata.lock); - ratio->srvdata = srvdata; - srvdata += sg->srv_n; - } + TfwRatio *ratio = sg->sched_data; + TfwRatioSrvDesc *srvdesc = ratio->srvdesc; - /* Initial setup of upstream server descriptors. 
 */
-	si = 0;
-	srvdesc = rpool->srvdesc;
 	list_for_each_entry(srv, &sg->srv_list, list) {
 		TfwSrvConn **conn, *srv_conn;
 
 		if (unlikely((si++ == sg->srv_n) || !srv->conn_n
 			     || list_empty(&srv->conn_list)))
-			goto cleanup;
+			return -EINVAL;
 
 		size = sizeof(TfwSrvConn *) * srv->conn_n;
-		if (!(srvdesc->conn = kzalloc(size, GFP_KERNEL))) {
-			ret = -ENOMEM;
-			goto cleanup;
-		}
+		if (!(srvdesc->conn = kzalloc(size, GFP_KERNEL)))
+			return -ENOMEM;
 
 		ci = 0;
 		conn = srvdesc->conn;
 		list_for_each_entry(srv_conn, &srv->conn_list, list) {
 			if (unlikely(ci++ == srv->conn_n))
-				goto cleanup;
+				return -EINVAL;
 			*conn++ = srv_conn;
 		}
 		if (unlikely(ci != srv->conn_n))
-			goto cleanup;
+			return -EINVAL;
 
 		srvdesc->conn_n = srv->conn_n;
 		srvdesc->srv = srv;
@@ -1107,71 +1058,143 @@ tfw_sched_ratio_add_grp(TfwSrvGroup *sg)
 		++srvdesc;
 	}
 	if (unlikely(si != sg->srv_n))
-		goto cleanup;
+		return -EINVAL;
+
+	return 0;
+}
+
+static TfwRatio *
+tfw_sched_ratio_add_grp_common(TfwSrvGroup *sg)
+{
+	int ret;
+	size_t size;
+	TfwRatio *ratio;
+	TfwRatioData *rtodata;
+
+	TFW_DBG2("%s: SG=[%s]\n", __func__, sg->name);
+
+	size = sizeof(TfwRatio) + sizeof(TfwRatioSrvDesc) * sg->srv_n;
+	if (!(sg->sched_data = kzalloc(size, GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	ratio = sg->sched_data;
+	ratio->srv_n = sg->srv_n;
+	ratio->psidx = sg->flags & TFW_SG_M_PSTATS_IDX;
+
+	ratio->srvdesc = (TfwRatioSrvDesc *)(ratio + 1);
+	if ((ret = tfw_sched_ratio_srvdesc_setup(sg)))
+		return ERR_PTR(ret);
+
+	if (!(rtodata = tfw_sched_ratio_rtodata_get(ratio)))
+		return ERR_PTR(-ENOMEM);
+	rcu_assign_pointer(ratio->rtodata, rtodata);
+
+	return ratio;
+}
+
+static int
+tfw_sched_ratio_add_grp_static(TfwSrvGroup *sg)
+{
+	TfwRatio *ratio;
+
+	ratio = tfw_sched_ratio_add_grp_common(sg);
+	if (IS_ERR(ratio))
+		return PTR_ERR(ratio);
+
+	/* Calculate the static ratio data for each server. */
+	tfw_sched_ratio_calc_static(ratio, ratio->rtodata);
+
+	return 0;
+}
+
+static int
+tfw_sched_ratio_add_grp_dynamic(TfwSrvGroup *sg)
+{
+	TfwRatio *ratio;
+	TfwSchrefPredict *schref = sg->sched_data;
+
+	TFW_DBG2("%s: SG=[%s]\n", __func__, sg->name);
+
+	ratio = tfw_sched_ratio_add_grp_common(sg);
+	if (IS_ERR(ratio))
+		return PTR_ERR(ratio);
 
 	/* Set up the necessary workspace for predictive scheduler. */
 	if (sg->flags & TFW_SG_F_SCHED_RATIO_PREDICT) {
-		size_t slot_n;
+		size_t size, slot_n;
 		TfwRatioHstUnit *hunit;
 		TfwRatioHstData *hdata;
-		TfwRatioHstDesc *hpast, *hpast_end;
-		TfwSchrefPredict *schref = sched_data;
+		TfwRatioHstDesc *hdesc, *hdesc_end;
 
+		BUG_ON(!schref);
 		slot_n = schref->past * schref->rate;
 		size = sizeof(TfwRatioHstData)
 		       + sizeof(TfwRatioHstDesc) * sg->srv_n
 		       + sizeof(TfwRatioHstUnit) * sg->srv_n * slot_n;
+		if (!(ratio->hstdata = kzalloc(size, GFP_KERNEL)))
+			return -ENOMEM;
 
-		if (!(rpool->hstdata = kzalloc(size, GFP_KERNEL))) {
-			ret = -ENOMEM;
-			goto cleanup;
-		}
-		hdata = rpool->hstdata;
-		hdata->past = (TfwRatioHstDesc *)(hdata + 1);
+		hdata = ratio->hstdata;
+		hdata->hstdesc = (TfwRatioHstDesc *)(hdata + 1);
 		hdata->slot_n = slot_n;
 		hdata->ahead = schref->ahead * schref->rate;
-		hpast_end = hdata->past + sg->srv_n;
-		hunit = (TfwRatioHstUnit *)hpast_end;
-		for (hpast = hdata->past; hpast < hpast_end; ++hpast) {
-			hpast->hist = hunit;
+		hdesc_end = hdata->hstdesc + sg->srv_n;
+		hunit = (TfwRatioHstUnit *)hdesc_end;
+		for (hdesc = hdata->hstdesc; hdesc < hdesc_end; ++hdesc) {
+			hdesc->hist = hunit;
 			hunit += slot_n;
 		}
 	}
 
 	/*
-	 * Set up the initial ratio data. For dynamic ratios it's all
-	 * equal initial weights.
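tfw_sched_ratio_add_grp_common() above allocates the TfwRatio{} header and the trailing array of server descriptors with a single kzalloc() and then points @srvdesc just past the header. In isolation the idiom looks like this -- a sketch with reduced, hypothetical types and user-space calloc() standing in for kzalloc():

#include <stdlib.h>

struct srvdesc {			/* stand-in for TfwRatioSrvDesc */
	void *srv;
	unsigned int conn_n;
};

struct ratio {				/* stand-in for TfwRatio */
	size_t srv_n;
	struct srvdesc *srvdesc;	/* points into the same allocation */
};

static struct ratio *ratio_alloc(size_t srv_n)
{
	/* One allocation: the header followed by the descriptor array. */
	struct ratio *r = calloc(1, sizeof(*r) + srv_n * sizeof(struct srvdesc));

	if (!r)
		return NULL;
	r->srv_n = srv_n;
	r->srvdesc = (struct srvdesc *)(r + 1);	/* first byte past the header */
	return r;
}

The design choice is that one free() (kfree() in the patch) releases both the header and the array, and the descriptors sit in the same cache-friendly region as the header they belong to.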
+	 * Calculate the initial ratio data for each server. That's
+	 * based on equal initial (default) weights that are set by
+	 * the configuration processing routines.
 	 */
-	if (!(sg->flags & (TFW_SG_F_SCHED_RATIO_STATIC
-			   | TFW_SG_F_SCHED_RATIO_DYNAMIC
-			   | TFW_SG_F_SCHED_RATIO_PREDICT)))
-	{
-		ret = -EINVAL;
-		goto cleanup;
-	}
-
-	/* Calculate initial ratios for each server. */
-	rcu_assign_pointer(rpool->ratio, tfw_sched_ratio_rpool_get(rpool));
-	tfw_sched_ratio_calc_static(rpool, rpool->ratio);
+	tfw_sched_ratio_calc_static(ratio, ratio->rtodata);
 
 	/* Set up periodic re-calculation of ratios. */
 	if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) {
-		rpool->intvl = TFW_SCHED_RATIO_INTVL;
-		atomic_set(&rpool->rearm, 1);
+		ratio->intvl = TFW_SCHED_RATIO_INTVL;
+		atomic_set(&ratio->rearm, 1);
 		smp_mb__after_atomic();
-		setup_timer(&rpool->timer,
+		setup_timer(&ratio->timer,
 			    tfw_sched_ratio_dynamic_tmfn, (unsigned long)sg);
-		mod_timer(&rpool->timer, jiffies + rpool->intvl);
+		mod_timer(&ratio->timer, jiffies + ratio->intvl);
 	} else if (sg->flags & TFW_SG_F_SCHED_RATIO_PREDICT) {
-		TfwSchrefPredict *schref = sched_data;
-		rpool->intvl = msecs_to_jiffies(1000 / schref->rate);
-		atomic_set(&rpool->rearm, 1);
+		ratio->intvl = msecs_to_jiffies(1000 / schref->rate);
+		atomic_set(&ratio->rearm, 1);
 		smp_mb__after_atomic();
-		setup_timer(&rpool->timer,
+		setup_timer(&ratio->timer,
 			    tfw_sched_ratio_predict_tmfn, (unsigned long)sg);
-		mod_timer(&rpool->timer, jiffies + rpool->intvl);
+		mod_timer(&ratio->timer, jiffies + ratio->intvl);
+	}
+
+	return 0;
+}
+
+static int
+tfw_sched_ratio_add_grp(TfwSrvGroup *sg)
+{
+	int ret;
+
+	if (unlikely(!sg->srv_n || list_empty(&sg->srv_list)))
+		return -EINVAL;
+
+	switch (sg->flags & TFW_SG_M_SCHED_RATIO_TYPE) {
+	case TFW_SG_F_SCHED_RATIO_STATIC:
+		if ((ret = tfw_sched_ratio_add_grp_static(sg)))
+			goto cleanup;
+		break;
+	case TFW_SG_F_SCHED_RATIO_DYNAMIC:
+	case TFW_SG_F_SCHED_RATIO_PREDICT:
+		if ((ret = tfw_sched_ratio_add_grp_dynamic(sg)))
+			goto cleanup;
+		break;
+	default:
+		return -EINVAL;
 	}
 
 	return 0;
diff --git a/tempesta_fw/server.h b/tempesta_fw/server.h
index c9a4b22b9..d871e53e0 100644
--- a/tempesta_fw/server.h
+++ b/tempesta_fw/server.h
@@ -106,10 +106,13 @@ typedef struct {
 /* Server and server group related flags.
  * Lower 4 bits keep an index into APM stats array. */
-#define TFW_SG_F_PSTATS_IDX_MASK	0x000f
+#define TFW_SG_M_PSTATS_IDX		0x000f
 #define TFW_SG_F_SCHED_RATIO_STATIC	0x0010
 #define TFW_SG_F_SCHED_RATIO_DYNAMIC	0x0020
 #define TFW_SG_F_SCHED_RATIO_PREDICT	0x0040
+#define TFW_SG_M_SCHED_RATIO_TYPE	(TFW_SG_F_SCHED_RATIO_STATIC	\
+					 | TFW_SG_F_SCHED_RATIO_DYNAMIC	\
+					 | TFW_SG_F_SCHED_RATIO_PREDICT)
 #define TFW_SRV_RETRY_NIP		0x0100	/* Retry non-idemporent req. */
 #define TFW_SRV_STICKY			0x0200	/* Use sticky sessions. */
diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c
index 6ad8e6081..a69cf2225 100644
--- a/tempesta_fw/sock_srv.c
+++ b/tempesta_fw/sock_srv.c
@@ -934,7 +934,9 @@ tfw_cfg_sg_ratio_verify(TfwSrvGroup *sg)
 	TfwServer *srv;
 	int count = 0;
 
-	if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) {
+	if (sg->flags & (TFW_SG_F_SCHED_RATIO_DYNAMIC
+			 | TFW_SG_F_SCHED_RATIO_PREDICT))
+	{
 		list_for_each_entry(srv, tfw_cfg_slst, list) {
 			if (srv->weight)
 				break;
@@ -975,7 +977,8 @@ tfw_cfgop_setup_srv_group(void)
 	 * Check 'ratio' scheduler configuration for incompatibilities.
 	 * Set weight to default value for each server in the group
 	 * if no weight is provided in the configuration. For dynamic
-	 * ratio this sets initial equal weights to all servers.
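One detail worth underlining in the tfw_cfg_sg_ratio_verify() hunk: the scheduler bits must be combined with the bitwise `|`. A logical `||` evaluates to 1, silently turning the condition into a test of bit 0 of @flags -- which here belongs to the percentile-index mask, not to the scheduler type. A tiny self-contained demonstration with the same flag values:

#include <assert.h>

#define F_STATIC	0x0010
#define F_DYNAMIC	0x0020
#define F_PREDICT	0x0040

int main(void)
{
	unsigned int flags = F_PREDICT;

	/* Correct: build a mask with '|', then test it. */
	assert(flags & (F_DYNAMIC | F_PREDICT));
	/* Broken: '||' collapses the constants to 1... */
	assert((F_DYNAMIC || F_PREDICT) == 1);
	/* ...so the test silently checks bit 0 and misses the flag. */
	assert(!(flags & (F_DYNAMIC || F_PREDICT)));
	return 0;
}

All three assertions hold, which is precisely the problem: with `||` the predictive group would never be recognized by the check.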
+	 * or predictive ratios this sets initial equal weights to all
+	 * servers.
 	 */
 	if (!strcasecmp(tfw_cfg_sched->name, "ratio")) {
 		if (tfw_cfg_sg_ratio_verify(tfw_cfg_sg))
@@ -1481,7 +1484,7 @@ TfwCfgMod tfw_sock_srv_cfg_mod = {
 int
 tfw_sock_srv_init(void)
 {
-	BUILD_BUG_ON(_TFW_PSTATS_IDX_COUNT > TFW_SG_F_PSTATS_IDX_MASK);
+	BUILD_BUG_ON(_TFW_PSTATS_IDX_COUNT > TFW_SG_M_PSTATS_IDX);
 	BUG_ON(tfw_srv_conn_cache);
 	tfw_srv_conn_cache = kmem_cache_create("tfw_srv_conn_cache",
 					       sizeof(TfwSrvConn), 0, 0, NULL);

From d893c060e1430aabc66b76787b77b52134a5226c Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Sun, 28 May 2017 23:54:36 +0300
Subject: [PATCH 33/37] Don't mix signed and unsigned types in calculations.

The algorithm is brought up to the same code as in the Ratio
Scheduler (predictive algorithm).

---
 tempesta_fw/t/unit/user_space/Makefile |  3 +-
 tempesta_fw/t/unit/user_space/slr.cc   | 73 ++++++++++++++++++--------
 2 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/tempesta_fw/t/unit/user_space/Makefile b/tempesta_fw/t/unit/user_space/Makefile
index 212f114b8..bdec6b733 100644
--- a/tempesta_fw/t/unit/user_space/Makefile
+++ b/tempesta_fw/t/unit/user_space/Makefile
@@ -30,6 +30,7 @@ CACHELINE := $(shell getconf LEVEL1_DCACHE_LINESIZE)
 CFLAGS = -O0 -ggdb -Wall -Werror \
	 -pthread -DL1_CACHE_BYTES=$(CACHELINE) \
	 -I../../../../ktest
+CXXFLAGS = -std=c++11 ${CFLAGS}
 
 TARGETS = alb percentiles slr
 
 all : $(TARGETS)
@@ -41,7 +42,7 @@ alb : alb.c
	$(CC) $(CFLAGS) -o $@ $^
 
 slr : slr.cc
-	$(CXX) $(CFLAGS) -o $@ $^
+	$(CXX) $(CXXFLAGS) -o $@ $^
 
 clean : FORCE
	rm -f *.o *~ *.orig $(TARGETS)
diff --git a/tempesta_fw/t/unit/user_space/slr.cc b/tempesta_fw/t/unit/user_space/slr.cc
index afb4c05eb..7974e5acc 100644
--- a/tempesta_fw/t/unit/user_space/slr.cc
+++ b/tempesta_fw/t/unit/user_space/slr.cc
@@ -24,14 +24,13 @@
 
 #include <iostream>
 
-template <typename T>
+template <typename T, int wsz>
 class SLR {
-	static const long WSZ = 5;
	// Use the multiplier to calculate @y with 1/MUL
	// precission on integer arithmetic. */
	static const long MUL = 1000;
 
-	unsigned long	n;	/* observation number */
+	long		n;	/* observation number */
	T		x_avg, y_avg;
	T		xy_avg;		/* avg(x * y) */
	T		x_avg_y_avg;	/* avg(x) * avg(y) */
@@ -41,7 +40,7 @@ class SLR {
	struct {
		T	x;
		T	y;
-	} win[WSZ];
+	} win[wsz];
 
 public:
	SLR()
@@ -52,30 +51,30 @@ class SLR {
	void
	slr_upd(long x, long y)
	{
-		size_t ni, cnt;
+		int ni, sz;
 
		y *= MUL;
		x *= MUL;
+		ni = n % wsz;
 
-		if (n < WSZ) {
-			ni = n;
-			cnt = n + 1;
-			x_avg = (x_avg * n + x) / cnt;
-			y_avg = (y_avg * n + y) / cnt;
-			xy_avg = (xy_avg * n + y * x) / cnt;
+		if (n < wsz) {
+			sz = ni + 1;
+			x_avg = (x_avg * n + x) / sz;
+			y_avg = (y_avg * n + y) / sz;
+			xy_avg = (xy_avg * n + y * x) / sz;
			x_avg_y_avg = x_avg * y_avg;
-			x_sq_avg = (x_sq_avg * n + x * x) / cnt;
+			x_sq_avg = (x_sq_avg * n + x * x) / sz;
			x_avg_sq = x_avg * x_avg;
		} else {
			// Forget history before the window
			// to adopt to new pattern.
-			ni = n % WSZ;
-			x_avg = x_avg - (win[ni].x - x) / WSZ;
-			y_avg = y_avg - (win[ni].y - y) / WSZ;
-			xy_avg = xy_avg - (win[ni].x * win[ni].y - y * x) / WSZ;
+			sz = wsz;
+			x_avg = x_avg - (win[ni].x - x) / sz;
+			y_avg = y_avg - (win[ni].y - y) / sz;
+			xy_avg = xy_avg - (win[ni].x * win[ni].y - y * x) / sz;
			x_avg_y_avg = x_avg * y_avg;
			x_sq_avg = x_sq_avg - (win[ni].x * win[ni].x - x * x)
-				   / WSZ;
+				   / sz;
			x_avg_sq = x_avg * x_avg;
		}
 
@@ -127,11 +126,11 @@ class SLR {
	}
 };
 
-template <typename T>
+template <typename T, int wsz>
 void
 test()
 {
-	SLR<T> slr;
+	SLR<T, wsz> slr;
 
	slr.add_data(1, 3);
	slr.add_data(2, 5);
@@ -149,14 +148,44 @@ test()
	slr.predict(15);
 }
 
+// The major thing this test verifies is that the calculations
+// don't break when they're switched from working on partial
+// history to working on full-size history.
+template <typename T, int wsz>
+void
+test_verified()
+{
+	SLR<T, wsz> slr;
+
+	slr.add_data(1, 1);
+	slr.add_data(2, 1);
+	slr.add_data(3, 1);
+	slr.add_data(4, 1);
+	slr.add_data(5, 1);
+	slr.add_data(6, 1);
+	slr.add_data(7, 1);
+	slr.add_data(8, 1);
+	slr.add_data(9, 1);
+	slr.add_data(10, 1);
+	slr.add_data(11, 1);
+	slr.add_data(12, 1);
+	slr.add_data(13, 1);
+
+	slr.predict(15);
+}
+
 int
 main(int argc, char *argv[])
 {
	std::cout << "TEST for double" << std::endl;
-	test<double>();
+	test<double, 5>();
 
	std::cout << "TEST for long" << std::endl;
-	test<long>();
+	test<long, 5>();
+
+	std::cout << "Verified test for long, the result should be '1'" << std::endl;
+	test_verified<long, 5>();
+
	return 0;
 
	std::cout << std::endl;
@@ -170,7 +199,7 @@ main(int argc, char *argv[])
	std::cout << "> ";
 
	long x, y, pred_x;
-	SLR<long> slr;
+	SLR<long, 5> slr;
	while (std::cin >> x >> y >> pred_x) {
		slr.slr_upd(x, y);
		std::cout << "(x=" << x << " y=" << y

From 5c37ed18ca8d76ae7094b0d9d6dcc34b4d7988b1 Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Mon, 29 May 2017 00:09:50 +0300
Subject: [PATCH 34/37] A bit more data for servers in /proc/tempesta/servers/*.

Add the number of schedulable connections for each server.
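As a cross-check on the slr.cc estimator above: once its window is full, the incremental updates must agree with the textbook least-squares fit computed directly from the windowed averages, k = (avg(xy) - avg(x)*avg(y)) / (avg(x^2) - avg(x)^2) and b = avg(y) - k*avg(x). A brute-force version over made-up points lying exactly on y = 2x + 1:

#include <stdio.h>

int main(void)
{
	const double x[] = { 1, 2, 3, 4, 5 }, y[] = { 3, 5, 7, 9, 11 };
	const int n = 5;
	double sx = 0, sy = 0, sxy = 0, sxx = 0;
	double x_avg, y_avg, k, b;
	int i;

	for (i = 0; i < n; ++i) {
		sx += x[i];
		sy += y[i];
		sxy += x[i] * y[i];
		sxx += x[i] * x[i];
	}
	x_avg = sx / n;
	y_avg = sy / n;
	k = (sxy / n - x_avg * y_avg) / (sxx / n - x_avg * x_avg);
	b = y_avg - k * x_avg;
	/* The data is exactly linear, so this prints k=2 b=1 y(15)=31. */
	printf("k=%g b=%g y(15)=%g\n", k, b, k * 15 + b);
	return 0;
}

The point of the incremental form in slr.cc is that it gets the same answer in O(1) work per sample, which is what makes it usable from the predictive scheduler's timer.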
--- tempesta_fw/procfs.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tempesta_fw/procfs.c b/tempesta_fw/procfs.c index 561bda9b1..8b18a0da1 100644 --- a/tempesta_fw/procfs.c +++ b/tempesta_fw/procfs.c @@ -146,9 +146,10 @@ tfw_srvstats_seq_show(struct seq_file *seq, void *off) { #define SPRNE(m, e) seq_printf(seq, m": %dms\n", e) - int i; + size_t i, rc; TfwSrvConn *srv_conn; TfwServer *srv = seq->private; + unsigned int qsize[srv->conn_n]; unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; TfwPrcntlStats pstats = { .ith = tfw_pstats_ith, @@ -167,12 +168,21 @@ tfw_srvstats_seq_show(struct seq_file *seq, void *off) for (i = TFW_PSTATS_IDX_ITH; i < ARRAY_SIZE(tfw_pstats_ith); ++i) seq_printf(seq, "%02d%%:\t%dms\n", pstats.ith[i], pstats.val[i]); - i = 0; + + i = rc = 0; + list_for_each_entry(srv_conn, &srv->conn_list, list) { + qsize[i++] = READ_ONCE(srv_conn->qsize); + if (tfw_srv_conn_restricted(srv_conn)) + rc++; + } + + seq_printf(seq, "Total schedulable connections\t: %zd\n", + srv->conn_n - rc); seq_printf(seq, "Maximum forwarding queue size\t: %d\n", srv->sg->max_qsize); - list_for_each_entry(srv_conn, &srv->conn_list, list) - seq_printf(seq, "\tConnection %03d queue size\t: %d\n", - ++i, ACCESS_ONCE(srv_conn->qsize)); + for (i = 0; i < srv->conn_n; ++i) + seq_printf(seq, "\tConnection %03zd queue size\t: %d\n", + i, qsize[i]); return 0; #undef SPRNE From a333f4c1e05dd9ca0ec3c59759db64aa83462aee Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Thu, 1 Jun 2017 16:55:31 +0300 Subject: [PATCH 35/37] Rework APM modules to work with per-CPU arrays for incoming data. Updates are stored in per-CPU arrays for each server. Processing of accumulated updates is done by a single thread that submits the updates and then runs the calculation of percentiles. That removes the concurrency between the updates and the calculation of percentiles. Different arrays are used for accumulating incoming update data and for processing the accumulated data. --- tempesta_fw/apm.c | 343 +++++++++++++++++++++------------------------- 1 file changed, 158 insertions(+), 185 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index 78d2b3956..3df88bfb4 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -380,6 +380,19 @@ typedef struct { atomic_t reset; } TfwApmRBEnt; +/* + * The ring buffer structure. + * + * @rbent - Array of ring buffer entries. + * @slock - The lock to adjust the ranges in the current entry. + * @rbufsz - The size of @rbent. + */ +typedef struct { + TfwApmRBEnt *rbent; + spinlock_t slock; + int rbufsz; +} TfwApmRBuf; + /* * The ring buffer contol structure. * @@ -397,19 +410,6 @@ typedef struct { unsigned long total_cnt; } TfwApmRBCtl; -/* - * The ring buffer structure. - * - * @rbent - Array of ring buffer entries. - * @slock - The lock to adjust the ranges in the current entry. - * @rbufsz - The size of @rbent. - */ -typedef struct { - TfwApmRBEnt *rbent; - spinlock_t slock; - int rbufsz; -} TfwApmRBuf; - /* * The stats entry data structure. * Keeps the latest values of calculated percentiles. @@ -442,6 +442,45 @@ typedef struct { atomic_t rdidx; } TfwApmStats; +/* + * An update buffer entry that holds RTT data for updates. + * + * The value of @centry depends on @jtstamp that comes as part of data + * for the update. Ideally, @jtstamp and @rtt would be stored instead + * of @centry and @rtt. However, together they occupy more than 64 bits, + * and it's highly desirable to read/write them in a single operation. 
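A user-space model of the TfwApmUBEnt idea described above: overlaying two 32-bit fields on one 64-bit word lets the producer store, and the consumer load, both fields with a single aligned access, and an all-ones value can serve as the "empty" sentinel because a real @rtt never has all bits set. The type below is a simplified analogue, not the kernel definition (which additionally packs the struct and goes through WRITE_ONCE()/READ_ONCE()):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef union {
	struct {
		uint32_t centry;
		uint32_t rtt;
	};
	uint64_t data;		/* one word carries both fields */
} ubent_t;

#define UBENT_EMPTY UINT64_MAX	/* sentinel: "no update in this slot" */

int main(void)
{
	ubent_t e = { .centry = 3, .rtt = 42 };
	ubent_t none = { .data = UBENT_EMPTY };

	printf("centry=%" PRIu32 " rtt=%" PRIu32 " raw=%#" PRIx64 "\n",
	       e.centry, e.rtt, e.data);
	printf("slot empty? %s\n", none.data == UBENT_EMPTY ? "yes" : "no");
	return 0;
}

Note that the field layout within the word is endianness-dependent; that's acceptable here because the producer and the consumer always run on the same host.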
+ * + * @centry - The entry number in the array of ring buffer entries. + * @rtt - The RTT of the message, in milliseconds. + */ +typedef union { + struct { + unsigned int centry; + unsigned int rtt; + } __attribute__((packed)); + uint64_t data; +} TfwApmUBEnt; + +/* + * The buffer that holds RTT data for updates, per CPU. + * + * The data for an update is stored in an array per CPU. The actual + * updates,and then the percentile recalculation is done periodically + * by a single thread, which removes concurrency between updates and + * the calculation. The data for an update is stored in one array of + * the two, while the processing thread processes the accumulated + * data in the other array. The switch between these two arrays is + * managed by way of @counter by the processing thread. + * + * @ubent - Arrays of ring buffer entries (flip-flop manner). + * @ubufsz - The size of @ubent. + * @counter - The counter that controls which @ubent to use. + */ +typedef struct { + TfwApmUBEnt *ubent[2]; + size_t ubufsz; + atomic64_t counter; +} TfwApmUBuf; /* * APM Data structure. * @@ -451,26 +490,26 @@ typedef struct { * If there are several different parties that do the calculation, * then the data may need to be organized differently. * - * @list - Member in @tfw_apm_qcalc or @tfw_apm_qrecalc. * @rbuf - The ring buffer for the specified time window. * @rbctl - The control data helpful in taking optimizations. * @stats - The latest percentiles. + * @ubuf - The buffer that holds data for updates, per CPU. * @timer - The periodic timer handle. * @flags - The atomic flags (see below). - * @refcnt - The reference count. */ -#define TFW_APM_DATA_F_RECALC (0x0001) /* Need to recalculate. */ -#define TFW_APM_DATA_F_UPDONE (0x0002) /* RTT update done. */ -#define TFW_APM_TIMER_TIMEOUT (HZ/20) /* The timer periodicity. */ +#define TFW_APM_DATA_F_REARM (0x0001) /* Re-arm the timer. */ +#define TFW_APM_DATA_F_RECALC (0x0002) /* Need to recalculate. */ + +#define TFW_APM_TIMER_INTVL (HZ / 20) +#define TFW_APM_UBUF_SZ TFW_APM_TIMER_INTVL /* a slot per ms. */ typedef struct { - struct list_head list; TfwApmRBuf rbuf; TfwApmRBCtl rbctl; TfwApmStats stats; + TfwApmUBuf __percpu *ubuf; struct timer_list timer; unsigned long flags; - atomic_t refcnt; } TfwApmData; /* @@ -488,30 +527,6 @@ static int tfw_apm_jtmwindow; /* Time window in jiffies. */ static int tfw_apm_jtmintrvl; /* Time interval in jiffies. */ static int tfw_apm_tmwscale; /* Time window scale. */ -/* Work Queue item for stats data. */ -typedef struct { - TfwApmData *data; - unsigned long jtstamp; - unsigned long rtt; - unsigned long __pad; -} TfwApmWqItem; - -/* A Work Queue on each CPU. */ -static DEFINE_PER_CPU(TfwRBQueue, tfw_apm_wq); - -/* - * @tfw_apm_qcalc - List of servers that require stats calculation. - * @tfw_apm_qrecalc - List of servers that require stats re-calculation. - * @tfw_apm_rearm - Atomic flag, tells if the timer needs re-arming. - * @tfw_apm_timer - The periodic timer handle. - */ -#define TFW_APM_DATA_F_REARM (0x0001) /* Re-arm the timer. */ - -static struct list_head tfw_apm_qcalc; -static struct list_head tfw_apm_qrecalc; -static unsigned long tfw_apm_rearm; -static struct timer_list tfw_apm_timer; - /* * Get the next bucket in the ring buffer entry that has a non-zero * hits count. 
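The flip-flop scheme that @counter implements can be modelled in a few lines. In this deliberately single-threaded sketch the memory-ordering machinery of the real code (atomic64_inc_return(), READ_ONCE()/WRITE_ONCE(), per-CPU instances) is elided, and all names and sizes are made up:

#include <stdint.h>
#include <stdio.h>

enum { UBUFSZ = 8 };
#define EMPTY UINT64_MAX

static uint64_t buf[2][UBUFSZ];	/* the two update arrays */
static uint64_t counter;	/* selects the producers' array */

static void ub_init(void)
{
	int a, i;

	for (a = 0; a < 2; ++a)
		for (i = 0; i < UBUFSZ; ++i)
			buf[a][i] = EMPTY;
}

/* Producer side: always writes to the currently selected array. */
static void ub_update(int slot, uint64_t val)
{
	buf[counter % 2][slot % UBUFSZ] = val;
}

/* Processing side: flip first, so new updates go to the other array,
 * then drain the array the producers were using until now. */
static void ub_process(void)
{
	uint64_t *b = buf[counter++ % 2];
	int i;

	for (i = 0; i < UBUFSZ; ++i) {
		if (b[i] == EMPTY)
			continue;
		printf("consumed %llu\n", (unsigned long long)b[i]);
		b[i] = EMPTY;	/* return the slot to the producers */
	}
}

int main(void)
{
	ub_init();
	ub_update(0, 42);
	ub_update(3, 7);
	ub_process();		/* consumes 42 and 7 */
	ub_update(1, 9);	/* lands in the other array */
	ub_process();		/* consumes 9 */
	return 0;
}

The design removes the update/calculation race without locks: at any moment producers and the processing thread touch different arrays.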
Set the bucket's sequential number, the range number, @@ -877,30 +892,6 @@ tfw_apm_pstats_verify(TfwPrcntlStats *pstats) return 0; } -static inline void -tfw_apm_data_put(TfwApmData *data) -{ - if (atomic_dec_and_test(&data->refcnt)) - kfree(data); -} - -static inline void -tfw_apm_data_get(TfwApmData *data) -{ - atomic_inc(&data->refcnt); -} - -static inline void -__tfw_apm_update(TfwApmRBuf *rbuf, unsigned long jtstamp, unsigned int rtt) -{ - int centry = (jtstamp / tfw_apm_jtmintrvl) % rbuf->rbufsz; - unsigned long jtmistart = jtstamp - (jtstamp % tfw_apm_jtmintrvl); - TfwApmRBEnt *crbent = &rbuf->rbent[centry]; - - tfw_apm_rbent_checkreset(crbent, jtmistart); - tfw_stats_update(&crbent->pcntrng, rtt); -} - /* * Calculate the latest percentiles if necessary. * Runs periodically on timer. @@ -908,74 +899,55 @@ __tfw_apm_update(TfwApmRBuf *rbuf, unsigned long jtstamp, unsigned int rtt) static void tfw_apm_prcntl_tmfn(unsigned long fndata) { - int cpu, interval = TFW_APM_TIMER_TIMEOUT; - TfwApmData *data, *tmp; + int i, icpu, updone = 0; + TfwApmData *data = (TfwApmData *)fndata; + TfwApmRBuf *rbuf = &data->rbuf; + TfwApmRBEnt *rbent = rbuf->rbent; - /* No arguments. */ - BUG_ON(fndata); + BUG_ON(!fndata); /* - * Process work queues on all CPUs and update stats with data - * from each work item in the queue. Add servers with updated - * stats to the list for calculation of stats. Each server is - * added to the list just once. - * - * If server's APM data is already on the list, that means it - * is on @qrecalc list. Just remove it from @qrecalc list and - * it will be put on @qcalc list as usual for calculation of - * stats values. Note that this is a highly unlikely case. - * - * Note that if server needs a recalculation of stats values, - * it makes sense only if there were updates to server's stats - * data. If there's no updates then a recalculation will lead - * to the same (insufficient) result. + * Increment the counter and make the updates use the other array + * of the two that are available. In the meanwhile, use the array + * filled with updates to process them and calculate percentiles. */ - for_each_online_cpu(cpu) { - TfwApmWqItem wq_item; - TfwRBQueue *wq = &per_cpu(tfw_apm_wq, cpu); - - while (!tfw_wq_pop(wq, &wq_item)) { - data = wq_item.data; - __tfw_apm_update(&data->rbuf, - wq_item.jtstamp, wq_item.rtt); - if (data->flags & TFW_APM_DATA_F_UPDONE) { - tfw_apm_data_put(data); + for_each_online_cpu(icpu) { + TfwApmUBuf *ubuf = per_cpu_ptr(data->ubuf, icpu); + unsigned long idxval = atomic64_inc_return(&ubuf->counter); + TfwApmUBEnt *ubent = ubuf->ubent[(idxval - 1) % 2]; + TfwApmUBEnt rtt_data; + + for (i = 0; i < ubuf->ubufsz; ++i) { + rtt_data.data = READ_ONCE(ubent[i].data); + if (rtt_data.data == ULONG_MAX) continue; - } - if (unlikely(!list_empty(&data->list))) - list_del_init(&data->list); - data->flags |= TFW_APM_DATA_F_UPDONE; - list_add_tail(&data->list, &tfw_apm_qcalc); + WRITE_ONCE(ubent[i].data, ULONG_MAX); + tfw_stats_update(&rbent[rtt_data.centry].pcntrng, + rtt_data.rtt); + ++updone; } } - /* - * Calculate stats values for each server that has been updated. - * If the calculation cannot be completed with the current data, - * then move that server to a separate list. When stats data is - * updated, the calculation will be repeated. 
- */ - list_for_each_entry_safe(data, tmp, &tfw_apm_qcalc, list) { - BUG_ON(!(data->flags & TFW_APM_DATA_F_UPDONE)); - list_del_init(&data->list); - data->flags &= ~TFW_APM_DATA_F_UPDONE; - if (unlikely(tfw_apm_calc(data))) { - list_add_tail(&data->list, &tfw_apm_qrecalc); - continue; - } - tfw_apm_data_put(data); + if (updone && unlikely(tfw_apm_calc(data))) { + TFW_DBG2("%s: Incomplete calculation\n", __func__); } - /* - * Recalculation of stats values is needed for some servers. - * Do it ASAP in anticipation that will be updates to stats - * data for those servers. - */ - if (unlikely(!list_empty(&tfw_apm_qrecalc))) - interval = 1; - smp_mb(); - if (test_bit(TFW_APM_DATA_F_REARM, &tfw_apm_rearm)) - mod_timer(&tfw_apm_timer, jiffies + interval); + if (test_bit(TFW_APM_DATA_F_REARM, &data->flags)) + mod_timer(&data->timer, jiffies + TFW_APM_TIMER_INTVL); +} + +static void +__tfw_apm_update(TfwApmData *data, unsigned long jtstamp, unsigned long rtt) +{ + TfwApmUBuf *ubuf = this_cpu_ptr(data->ubuf); + unsigned long idxval = atomic64_add_return(0, &ubuf->counter); + TfwApmUBEnt *ubent = ubuf->ubent[idxval % 2]; + int centry = (jtstamp / tfw_apm_jtmintrvl) % data->rbuf.rbufsz; + unsigned long jtmistart = jtstamp - (jtstamp % tfw_apm_jtmintrvl); + TfwApmUBEnt rtt_data = { .centry = centry, .rtt = rtt }; + + tfw_apm_rbent_checkreset(&data->rbuf.rbent[centry], jtmistart); + WRITE_ONCE(ubent[jtstamp % ubuf->ubufsz].data, rtt_data.data); } void @@ -989,16 +961,21 @@ tfw_apm_update(void *apmref, unsigned long jtstamp, unsigned long jrtt) * the maximum value possible for TfwPcntCtl{}->end. Currently * the value is USHRT_MAX which is about 65 secs in milliseconds. */ - if (likely(rtt < (1UL << FIELD_SIZEOF(TfwPcntCtl, end) * 8))) { - TfwApmWqItem wq_item = { - .data = apmref, - .jtstamp = jtstamp, - .rtt = rtt, - }; - tfw_apm_data_get(wq_item.data); - if (__tfw_wq_push(this_cpu_ptr(&tfw_apm_wq), &wq_item, 0)) - tfw_apm_data_put(wq_item.data); + if (likely(rtt < (1UL << FIELD_SIZEOF(TfwPcntCtl, end) * 8))) + __tfw_apm_update(apmref, jtstamp, rtt); +} + +static void +tfw_apm_destroy(TfwApmData *data) +{ + int icpu; + + for_each_online_cpu(icpu) { + TfwApmUBuf *ubuf = per_cpu_ptr(data->ubuf, icpu); + kfree(ubuf->ubent[0]); } + free_percpu(data->ubuf); + kfree(data); } /* @@ -1013,13 +990,16 @@ tfw_apm_rbent_init(TfwApmRBEnt *rbent, unsigned long jtmistamp) /* * Create and initialize an APM ring buffer for a server. + * + * Note that due to specifics of Tempesta start up process this code + * is executed in SoftIRQ context (so that sleeping is not allowed). */ void * tfw_apm_create(void) { TfwApmData *data; TfwApmRBEnt *rbent; - int i, size; + int i, icpu, size; unsigned int *val[2]; int rbufsz = tfw_apm_tmwscale; int psz = ARRAY_SIZE(tfw_pstats_ith); @@ -1030,11 +1010,19 @@ tfw_apm_create(void) } /* Keep complete stats for the full time window. */ - size = sizeof(TfwApmData) + rbufsz * sizeof(TfwApmRBEnt) - + 2 * psz * sizeof(unsigned int); + size = sizeof(TfwApmData) + + rbufsz * sizeof(TfwApmRBEnt) + + 2 * psz * sizeof(unsigned int); if ((data = kzalloc(size, GFP_ATOMIC)) == NULL) return NULL; + size = sizeof(TfwApmUBuf); + data->ubuf = __alloc_percpu_gfp(size, sizeof(int64_t), GFP_ATOMIC); + if (!data->ubuf) { + kfree(data); + return NULL; + } + /* Set up memory areas. 
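The arithmetic in __tfw_apm_update() that maps a jiffies timestamp onto a ring buffer entry deserves a tiny standalone illustration. The constants below are invented stand-ins for tfw_apm_jtmintrvl and the ring buffer size (the real values come from the apm_stats configuration):

#include <stdio.h>

enum { JTMINTRVL = 50, RBUFSZ = 4 };	/* 4 intervals of 50 ticks each */

int main(void)
{
	unsigned long jt;

	for (jt = 0; jt < 2 * JTMINTRVL * RBUFSZ; jt += 70) {
		/* Same formulas as in __tfw_apm_update(). */
		unsigned long centry = (jt / JTMINTRVL) % RBUFSZ;
		unsigned long jtmistart = jt - (jt % JTMINTRVL);

		printf("t=%3lu -> entry %lu (interval starts at %lu)\n",
		       jt, centry, jtmistart);
	}
	return 0;
}

Timestamps walk forward through the entries and wrap around after RBUFSZ intervals, which is why the entry must be checked (and possibly reset for the new interval start) before an update is accounted to it.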
*/ rbent = (TfwApmRBEnt *)(data + 1); val[0] = (unsigned int *)(rbent + rbufsz); @@ -1060,9 +1048,26 @@ tfw_apm_create(void) rwlock_init(&data->stats.asent[1].rwlock); atomic_set(&data->stats.rdidx, 0); - INIT_LIST_HEAD(&data->list); + size = 2 * sizeof(TfwApmUBEnt) * TFW_APM_UBUF_SZ; + for_each_online_cpu(icpu) { + TfwApmUBEnt *ubent; + TfwApmUBuf *ubuf = per_cpu_ptr(data->ubuf, icpu); + if (!(ubent = kzalloc(size, GFP_ATOMIC))) + goto cleanup; + ubuf->ubent[0] = ubent; + ubuf->ubent[1] = ubent + TFW_APM_UBUF_SZ; + ubuf->ubufsz = TFW_APM_UBUF_SZ; + for (i = 0; i < ubuf->ubufsz; ++i) + WRITE_ONCE(ubuf->ubent[0][i].data, ULONG_MAX); + for (i = 0; i < ubuf->ubufsz; ++i) + WRITE_ONCE(ubuf->ubent[1][i].data, ULONG_MAX); + } return data; + +cleanup: + tfw_apm_destroy(data); + return NULL; } int @@ -1075,7 +1080,11 @@ tfw_apm_add_srv(TfwServer *srv) if (!(data = tfw_apm_create())) return -ENOMEM; - tfw_apm_data_get(data); + /* Start the timer for the percentile calculation. */ + set_bit(TFW_APM_DATA_F_REARM, &data->flags); + setup_timer(&data->timer, tfw_apm_prcntl_tmfn, (unsigned long)data); + mod_timer(&data->timer, jiffies + TFW_APM_TIMER_INTVL); + srv->apmref = data; return 0; @@ -1084,10 +1093,17 @@ tfw_apm_add_srv(TfwServer *srv) void tfw_apm_del_srv(TfwServer *srv) { - if (!srv->apmref) + TfwApmData *data = srv->apmref; + + if (!data) return; - tfw_apm_data_put(srv->apmref); + /* Stop the timer and the percentile calculation. */ + clear_bit(TFW_APM_DATA_F_REARM, &data->flags); + smp_mb__after_atomic(); + del_timer_sync(&data->timer); + + tfw_apm_destroy(data); srv->apmref = NULL; } @@ -1104,7 +1120,6 @@ tfw_apm_del_srv(TfwServer *srv) static int tfw_apm_cfg_start(void) { - int cpu; unsigned int jtmwindow; if (!tfw_apm_jtmwindow) @@ -1142,50 +1157,9 @@ tfw_apm_cfg_start(void) } tfw_apm_jtmwindow = tfw_apm_jtmintrvl * tfw_apm_tmwscale; - TFW_WQ_CHECKSZ(TfwApmWqItem); - for_each_online_cpu(cpu) { - TfwRBQueue *wq = &per_cpu(tfw_apm_wq, cpu); - tfw_wq_init(wq, cpu_to_node(cpu)); - } - - tfw_apm_rearm = 0; - INIT_LIST_HEAD(&tfw_apm_qcalc); - INIT_LIST_HEAD(&tfw_apm_qrecalc); - - /* Start the timer for the percentile calculation. */ - set_bit(TFW_APM_DATA_F_REARM, &tfw_apm_rearm); - setup_timer(&tfw_apm_timer, tfw_apm_prcntl_tmfn, 0UL); - mod_timer(&tfw_apm_timer, jiffies + TFW_APM_TIMER_TIMEOUT); - return 0; } -static void -tfw_apm_cfg_stop(void) -{ - int cpu; - TfwApmData *data, *tmp; - - clear_bit(TFW_APM_DATA_F_REARM, &tfw_apm_rearm); - smp_mb__after_atomic(); - del_timer_sync(&tfw_apm_timer); - - for_each_online_cpu(cpu) { - TfwApmWqItem wq_item; - TfwRBQueue *wq = &per_cpu(tfw_apm_wq, cpu); - - while (!tfw_wq_pop(wq, &wq_item)) - tfw_apm_data_put(wq_item.data); - - tfw_wq_destroy(wq); - } - list_for_each_entry_safe(data, tmp, &tfw_apm_qrecalc, list) { - list_del_init(&data->list); - tfw_apm_data_put(data); - } - BUG_ON(!list_empty(&tfw_apm_qcalc)); -} - /** * Cleanup the configuration values when when all server groups are stopped * and the APM timers are deleted. @@ -1243,6 +1217,5 @@ static TfwCfgSpec tfw_apm_cfg_specs[] = { TfwCfgMod tfw_apm_cfg_mod = { .name = "apm", .start = tfw_apm_cfg_start, - .stop = tfw_apm_cfg_stop, .specs = tfw_apm_cfg_specs, }; From d8345878b4a198385b7f5306dc1f7ea70fe58bcd Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Fri, 2 Jun 2017 18:08:03 +0300 Subject: [PATCH 36/37] Add list_head argument to tfw_cfg_sg_ratio_adjust() for unit tests. Consistently use global variables in configuration processing code. 
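Since patch 36's stated motivation is unit testing, a sketch of what such a test could look like. Everything here is hypothetical scaffolding except tfw_cfg_sg_ratio_adjust() itself, TfwServer's @list/@weight members, and TFW_CFG_SRV_WEIGHT_DEF; a real test would also need access to the static function, e.g. by including sock_srv.c into the test unit:

/* Hypothetical unit test: build a local server list, run the
 * adjustment, and check that only unset weights get the default. */
static void
test_ratio_adjust_defaults(void)
{
	LIST_HEAD(slst);
	TfwServer s1 = { .weight = 0 }, s2 = { .weight = 7 };

	list_add_tail(&s1.list, &slst);
	list_add_tail(&s2.list, &slst);

	BUG_ON(tfw_cfg_sg_ratio_adjust(&slst));
	BUG_ON(s1.weight != TFW_CFG_SRV_WEIGHT_DEF);	/* defaulted */
	BUG_ON(s2.weight != 7);				/* untouched */
}

This is exactly what the new struct list_head argument buys: the function no longer reaches for the tfw_cfg_slst global, so a test can feed it any list it likes.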
---
 tempesta_fw/sock_srv.c | 35 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c
index a69cf2225..20d317238 100644
--- a/tempesta_fw/sock_srv.c
+++ b/tempesta_fw/sock_srv.c
@@ -720,20 +720,6 @@ tfw_cfgop_out_conn_retries(TfwCfgSpec *cs, TfwCfgEntry *ce)
 	return tfw_cfgop_intval(cs, ce, &tfw_cfg_out_cns_retries);
 }
 
-static int
-tfw_cfgop_set_conn_retries(TfwSrvGroup *sg, int recns)
-{
-	if (!recns) {
-		sg->max_recns = UINT_MAX;
-	} else if (recns < ARRAY_SIZE(tfw_srv_tmo_vals)) {
-		sg->max_recns = ARRAY_SIZE(tfw_srv_tmo_vals);
-	} else {
-		sg->max_recns = recns;
-	}
-
-	return 0;
-}
-
 /* Default and maximum values for "server" options. */
 #define TFW_CFG_SRV_CONNS_N_DEF		32	/* Default # of connections */
 #define TFW_CFG_SRV_WEIGHT_MIN		1	/* Min static weight value */
@@ -918,24 +904,24 @@ tfw_cfgop_begin_srv_group(TfwCfgSpec *cs, TfwCfgEntry *ce)
 }
 
 static int
-tfw_cfg_sg_ratio_adjust(TfwSrvGroup *sg)
+tfw_cfg_sg_ratio_adjust(struct list_head *slst)
 {
 	TfwServer *srv;
 
-	list_for_each_entry(srv, tfw_cfg_slst, list)
+	list_for_each_entry(srv, slst, list)
 		if (!srv->weight)
 			srv->weight = TFW_CFG_SRV_WEIGHT_DEF;
 	return 0;
 }
 
 static int
-tfw_cfg_sg_ratio_verify(TfwSrvGroup *sg)
+tfw_cfg_sg_ratio_verify(void)
 {
 	TfwServer *srv;
 	int count = 0;
 
-	if (sg->flags & (TFW_SG_F_SCHED_RATIO_DYNAMIC
-			 | TFW_SG_F_SCHED_RATIO_PREDICT))
+	if (tfw_cfg_sg->flags & (TFW_SG_F_SCHED_RATIO_DYNAMIC
+				 | TFW_SG_F_SCHED_RATIO_PREDICT))
 	{
 		list_for_each_entry(srv, tfw_cfg_slst, list) {
 			if (srv->weight)
@@ -945,7 +931,7 @@ tfw_cfg_sg_ratio_verify(TfwSrvGroup *sg)
 		if (count < tfw_cfg_slstsz) {
 			TFW_ERR_NL("srv_group %s: static weight [%d] used "
				   "with 'dynamic' scheduler option\n",
-				   sg->name, srv->weight);
+				   tfw_cfg_sg->name, srv->weight);
 			return -EINVAL;
 		}
 	}
@@ -962,12 +948,15 @@ tfw_cfgop_setup_srv_group(void)
 	BUG_ON(!tfw_cfg_sg);
 	BUG_ON(!tfw_cfg_sched);
 
-	tfw_cfgop_set_conn_retries(tfw_cfg_sg, tfw_cfg_cns_retries);
 	tfw_cfg_sg->max_qsize = tfw_cfg_queue_size ? : UINT_MAX;
 	tfw_cfg_sg->max_jqage = tfw_cfg_fwd_timeout
			      ? msecs_to_jiffies(tfw_cfg_fwd_timeout * 1000)
			      : ULONG_MAX;
 	tfw_cfg_sg->max_refwd = tfw_cfg_fwd_retries ? : UINT_MAX;
+	tfw_cfg_sg->max_recns = tfw_cfg_cns_retries
+			      ? max_t(int, tfw_cfg_cns_retries,
+				      ARRAY_SIZE(tfw_srv_tmo_vals))
+			      : UINT_MAX;
 
 	tfw_cfg_sg->flags = tfw_cfg_sg_flags;
 	tfw_cfg_sg->flags |= tfw_cfg_retry_nip | tfw_cfg_sticky_sess;
@@ -981,9 +970,9 @@ tfw_cfgop_setup_srv_group(void)
	 * servers.
	 */
 	if (!strcasecmp(tfw_cfg_sched->name, "ratio")) {
-		if (tfw_cfg_sg_ratio_verify(tfw_cfg_sg))
+		if (tfw_cfg_sg_ratio_verify())
 			return -EINVAL;
-		if (tfw_cfg_sg_ratio_adjust(tfw_cfg_sg))
+		if (tfw_cfg_sg_ratio_adjust(tfw_cfg_slst))
 			return -EINVAL;
 	}
 	/* Set up the server group with all servers that are in it. */

From a044aaee7fbf7244de455d7091740e60fc4df474 Mon Sep 17 00:00:00 2001
From: Aleksey Baulin
Date: Sat, 3 Jun 2017 16:01:42 +0300
Subject: [PATCH 37/37] Small cleanups in apm.c.

---
 tempesta_fw/apm.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c
index 3df88bfb4..d863b9001 100644
--- a/tempesta_fw/apm.c
+++ b/tempesta_fw/apm.c
@@ -29,7 +29,6 @@
 #include "log.h"
 #include "pool.h"
 #include "procfs.h"
-#include "work_queue.h"
 
 /*
 * The algorithm is constructed to be as efficient as possible.
That's @@ -1048,19 +1047,18 @@ tfw_apm_create(void) rwlock_init(&data->stats.asent[1].rwlock); atomic_set(&data->stats.rdidx, 0); - size = 2 * sizeof(TfwApmUBEnt) * TFW_APM_UBUF_SZ; + size = 2 * TFW_APM_UBUF_SZ * sizeof(TfwApmUBEnt); for_each_online_cpu(icpu) { TfwApmUBEnt *ubent; TfwApmUBuf *ubuf = per_cpu_ptr(data->ubuf, icpu); - if (!(ubent = kzalloc(size, GFP_ATOMIC))) + ubent = kmalloc_node(size, GFP_ATOMIC, cpu_to_node(icpu)); + if (!ubent) goto cleanup; + for (i = 0; i < 2 * TFW_APM_UBUF_SZ; ++i) + WRITE_ONCE(ubent[i].data, ULONG_MAX); ubuf->ubent[0] = ubent; ubuf->ubent[1] = ubent + TFW_APM_UBUF_SZ; ubuf->ubufsz = TFW_APM_UBUF_SZ; - for (i = 0; i < ubuf->ubufsz; ++i) - WRITE_ONCE(ubuf->ubent[0][i].data, ULONG_MAX); - for (i = 0; i < ubuf->ubufsz; ++i) - WRITE_ONCE(ubuf->ubent[1][i].data, ULONG_MAX); } return data;
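The final cleanup above swaps kzalloc() for kmalloc_node() and open-codes the initialization. Two ideas are in play: a per-CPU buffer performs better when allocated on that CPU's own NUMA node, and zero-filling is wasted work when the "empty" sentinel is ULONG_MAX rather than 0. The same idiom in a generic, hedged form (types and names invented; only kmalloc_node(), cpu_to_node() and for_each_online_cpu() are real kernel APIs):

#include <linux/slab.h>
#include <linux/topology.h>

static u64 *cpu_ent[NR_CPUS];	/* illustrative per-CPU buffer table */

static int bufs_init(size_t n)
{
	int cpu;
	size_t i;

	for_each_online_cpu(cpu) {
		/* Allocate on the CPU's own NUMA node... */
		u64 *ent = kmalloc_node(n * sizeof(*ent), GFP_ATOMIC,
					cpu_to_node(cpu));
		if (!ent)
			return -ENOMEM;	/* caller unwinds what was set up */
		/* ...and initialize by hand: the sentinel is non-zero,
		 * so kzalloc()'s zeroing would buy nothing. */
		for (i = 0; i < n; ++i)
			ent[i] = ULONG_MAX;
		cpu_ent[cpu] = ent;
	}
	return 0;
}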