From f75159e9936143177b442afc78150b7a7ad8aa07 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 20 Sep 2011 01:25:41 +0000
Subject: [PATCH 01/15] ptp: fix L2 event message recognition

The IEEE 1588 standard defines two kinds of messages, event and general
messages. Event messages require time stamping, and general do not. When
using UDP transport, two separate ports are used for the two message
types.

The BPF designed to recognize event messages incorrectly classifies L2
general messages as event messages. This commit fixes the issue by
extending the filter to check the message type field for L2 PTP packets.
Event messages are be distinguished from general messages by testing
the "general" bit.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Cc: <stable@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptp_classify.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h
index e07e2742a865a4..1dc420ba213a52 100644
--- a/include/linux/ptp_classify.h
+++ b/include/linux/ptp_classify.h
@@ -51,6 +51,7 @@
 #define PTP_CLASS_V2_VLAN (PTP_CLASS_V2 | PTP_CLASS_VLAN)
 
 #define PTP_EV_PORT 319
+#define PTP_GEN_BIT 0x08 /* indicates general message, if set in message type */
 
 #define OFF_ETYPE	12
 #define OFF_IHL		14
@@ -116,14 +117,20 @@ static inline int ptp_filter_init(struct sock_filter *f, int len)
 	{OP_OR,		0,   0, PTP_CLASS_IPV6		}, /*              */ \
 	{OP_RETA,	0,   0, 0			}, /*              */ \
 /*L3x*/	{OP_RETK,	0,   0, PTP_CLASS_NONE		}, /*              */ \
-/*L40*/	{OP_JEQ,	0,   6, ETH_P_8021Q		}, /* f goto L50   */ \
+/*L40*/	{OP_JEQ,	0,   9, ETH_P_8021Q		}, /* f goto L50   */ \
 	{OP_LDH,	0,   0, OFF_ETYPE + 4		}, /*              */ \
-	{OP_JEQ,	0,   9, ETH_P_1588		}, /* f goto L60   */ \
+	{OP_JEQ,	0,  15, ETH_P_1588		}, /* f goto L60   */ \
+	{OP_LDB,	0,   0, ETH_HLEN + VLAN_HLEN	}, /*              */ \
+	{OP_AND,	0,   0, PTP_GEN_BIT		}, /*              */ \
+	{OP_JEQ,	0,  12, 0			}, /* f goto L6x   */ \
 	{OP_LDH,	0,   0, ETH_HLEN + VLAN_HLEN	}, /*              */ \
 	{OP_AND,	0,   0, PTP_CLASS_VMASK		}, /*              */ \
 	{OP_OR,		0,   0, PTP_CLASS_VLAN		}, /*              */ \
 	{OP_RETA,	0,   0, 0			}, /*              */ \
-/*L50*/	{OP_JEQ,	0,   4, ETH_P_1588		}, /* f goto L61   */ \
+/*L50*/	{OP_JEQ,	0,   7, ETH_P_1588		}, /* f goto L61   */ \
+	{OP_LDB,	0,   0, ETH_HLEN		}, /*              */ \
+	{OP_AND,	0,   0, PTP_GEN_BIT		}, /*              */ \
+	{OP_JEQ,	0,   4, 0			}, /* f goto L6x   */ \
 	{OP_LDH,	0,   0, ETH_HLEN		}, /*              */ \
 	{OP_AND,	0,   0, PTP_CLASS_VMASK		}, /*              */ \
 	{OP_OR,		0,   0, PTP_CLASS_L2		}, /*              */ \

From 02715ed2e7b71569ed6e61f0b23b0736a509913d Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 20 Sep 2011 01:25:42 +0000
Subject: [PATCH 02/15] dp83640: reduce driver noise

The driver has two warning messages that might be triggered
by normal use cases. When they appear, the messages give the
impression of a never ending series of errors.

This commit changes them to debug messages instead.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/dp83640.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index cb6e0b486b1e22..edd7304773eb87 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -589,7 +589,7 @@ static void decode_rxts(struct dp83640_private *dp83640,
 	prune_rx_ts(dp83640);
 
 	if (list_empty(&dp83640->rxpool)) {
-		pr_warning("dp83640: rx timestamp pool is empty\n");
+		pr_debug("dp83640: rx timestamp pool is empty\n");
 		goto out;
 	}
 	rxts = list_first_entry(&dp83640->rxpool, struct rxts, list);
@@ -612,7 +612,7 @@ static void decode_txts(struct dp83640_private *dp83640,
 	skb = skb_dequeue(&dp83640->tx_queue);
 
 	if (!skb) {
-		pr_warning("dp83640: have timestamp but tx_queue empty\n");
+		pr_debug("dp83640: have timestamp but tx_queue empty\n");
 		return;
 	}
 	ns = phy2txts(phy_txts);

From d3c52173be57ec920deda28923d80b68e5630594 Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Wed, 21 Sep 2011 22:08:26 +0000
Subject: [PATCH 03/15] MAINTAINERS: tehuti: Alexander Indenbaum's address
 bounces

I got:
	Generating server: Tehuti.onmicrosoft.com

	baum@tehutinetworks.net
	#< #5.1.1 smtp;550 5.1.1 RESOLVER.ADR.RecipNotFound; not found> #SMTP#

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Alexander Indenbaum <baum@tehutinetworks.net>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: netdev@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ae8820e173a217..ace8f9c81b96a1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6374,7 +6374,6 @@ S:	Supported
 F:	arch/arm/mach-tegra
 
 TEHUTI ETHERNET DRIVER
-M:	Alexander Indenbaum <baum@tehutinetworks.net>
 M:	Andy Gospodarek <andy@greyhouse.net>
 L:	netdev@vger.kernel.org
 S:	Supported

From aabdcb0b553b9c9547b1a506b34d55a764745870 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Fri, 23 Sep 2011 08:23:47 +0000
Subject: [PATCH 04/15] can bcm: fix tx_setup off-by-one errors

This patch fixes two off-by-one errors that canceled each other out.
Checking for the same condition two times in bcm_tx_timeout_tsklet() reduced
the count of frames to be sent by one. This did not show up the first time
tx_setup is invoked as an additional frame is sent due to TX_ANNONCE.
Invoking a second tx_setup on the same item led to a reduced (by 1) number of
sent frames.

Reported-by: Andre Naujoks <nautsch@gmail.com>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/can/bcm.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/can/bcm.c b/net/can/bcm.c
index d6c8ae5b2e6a33..c9cdb8d78e7088 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -365,9 +365,6 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
 
 			bcm_send_to_user(op, &msg_head, NULL, 0);
 		}
-	}
-
-	if (op->kt_ival1.tv64 && (op->count > 0)) {
 
 		/* send (next) frame */
 		bcm_can_tx(op);
@@ -970,8 +967,9 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 		/* spec: send can_frame when starting timer */
 		op->flags |= TX_ANNOUNCE;
 
-		if (op->kt_ival1.tv64 && (op->count > 0)) {
-			/* op->count-- is done in bcm_tx_timeout_handler */
+		/* only start timer when having more frames than sent below */
+		if (op->kt_ival1.tv64 && (op->count > 1)) {
+			/* op->count-- is done in bcm_tx_timeout_tsklet */
 			hrtimer_start(&op->timer, op->kt_ival1,
 				      HRTIMER_MODE_REL);
 		} else
@@ -979,8 +977,11 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 				      HRTIMER_MODE_REL);
 	}
 
-	if (op->flags & TX_ANNOUNCE)
+	if (op->flags & TX_ANNOUNCE) {
 		bcm_can_tx(op);
+		if (op->kt_ival1.tv64 && (op->count > 0))
+			op->count--;
+	}
 
 	return msg_head->nframes * CFSIZ + MHSIZ;
 }

From 6482aa7c120447858da1869197b48eff66435a31 Mon Sep 17 00:00:00 2001
From: Divy Le Ray <divy@chelsio.com>
Date: Sat, 24 Sep 2011 06:11:31 +0000
Subject: [PATCH 05/15] cxgb4: Fix EEH on IBM P7IOC

Fix EEH recovery on new P Series platform by
requesting fundamental reset.

Signed-off-by: Divy Le Ray <divy@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/cxgb4/cxgb4_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/cxgb4/cxgb4_main.c b/drivers/net/cxgb4/cxgb4_main.c
index c9957b7f17b57f..b4efa292fd6fe7 100644
--- a/drivers/net/cxgb4/cxgb4_main.c
+++ b/drivers/net/cxgb4/cxgb4_main.c
@@ -3712,6 +3712,9 @@ static int __devinit init_one(struct pci_dev *pdev,
 		setup_debugfs(adapter);
 	}
 
+	/* PCIe EEH recovery on powerpc platforms needs fundamental reset */
+	pdev->needs_freset = 1;
+
 	if (is_offload(adapter))
 		attach_ulds(adapter);
 

From 676a1184e8afd4fed7948232df1ff91517400859 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Sun, 25 Sep 2011 02:21:30 +0000
Subject: [PATCH 06/15] ipv6: nullify ipv6_ac_list and ipv6_fl_list when
 creating new socket

ipv6_ac_list and ipv6_fl_list from listening socket are inadvertently
shared with new socket created for connection.

Signed-off-by: Zheng Yan <zheng.z.yan@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3c9fa618b69daa..79cc6469508d36 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1383,6 +1383,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
+		newnp->ipv6_ac_list = NULL;
+		newnp->ipv6_fl_list = NULL;
 		newnp->pktoptions  = NULL;
 		newnp->opt	   = NULL;
 		newnp->mcast_oif   = inet6_iif(skb);
@@ -1447,6 +1449,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	   First: no IPv4 options.
 	 */
 	newinet->inet_opt = NULL;
+	newnp->ipv6_ac_list = NULL;
 	newnp->ipv6_fl_list = NULL;
 
 	/* Clone RX bits */

From 95de86cf5162f78bc5aea80d1a9e5a248196ffaf Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Wed, 28 Sep 2011 05:33:43 +0000
Subject: [PATCH 07/15] ibmveth: Fix oops on request_irq failure

If request_irq fails, the ibmveth driver will overwrite
the rc and end up returning a successful rc on its open
function, resulting in an oops later when a packet gets
sent and buffers are not allocated due to the failed open.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ibmveth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index 8dd5fccef72529..d393f1e764eda8 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -636,8 +636,8 @@ static int ibmveth_open(struct net_device *netdev)
 		netdev_err(netdev, "unable to request irq 0x%x, rc %d\n",
 			   netdev->irq, rc);
 		do {
-			rc = h_free_logical_lan(adapter->vdev->unit_address);
-		} while (H_IS_LONG_BUSY(rc) || (rc == H_BUSY));
+			lpar_rc = h_free_logical_lan(adapter->vdev->unit_address);
+		} while (H_IS_LONG_BUSY(lpar_rc) || (lpar_rc == H_BUSY));
 
 		goto err_out;
 	}

From 605b91c8f619047ed6d88e24c531d4bdddad46f4 Mon Sep 17 00:00:00 2001
From: "Roy.Li" <rongqing.li@windriver.com>
Date: Wed, 28 Sep 2011 19:51:54 +0000
Subject: [PATCH 08/15] net: Documentation: Fix type of variables

Signed-off-by: Roy.Li <rongqing.li@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 81546990f41ca1..ca5cdcd0f0e368 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1042,7 +1042,7 @@ conf/interface/*:
 	The functional behaviour for certain settings is different
 	depending on whether local forwarding is enabled or not.
 
-accept_ra - BOOLEAN
+accept_ra - INTEGER
 	Accept Router Advertisements; autoconfigure using them.
 
 	Possible values are:
@@ -1106,7 +1106,7 @@ dad_transmits - INTEGER
 	The amount of Duplicate Address Detection probes to send.
 	Default: 1
 
-forwarding - BOOLEAN
+forwarding - INTEGER
 	Configure interface-specific Host/Router behaviour.
 
 	Note: It is recommended to have the same setting on all

From 85a64889492b45f931ddac87ec09d84aa7347ee1 Mon Sep 17 00:00:00 2001
From: Jonathan Lallinger <jonathan@ogc.us>
Date: Thu, 29 Sep 2011 07:58:41 +0000
Subject: [PATCH 09/15] RDSRDMA: Fix cleanup of rds_iw_mr_pool

In the rds_iw_mr_pool struct the free_pinned field keeps track of
memory pinned by free MRs. While this field is incremented properly
upon allocation, it is never decremented upon unmapping. This would
cause the rds_rdma module to crash the kernel upon unloading, by
triggering the BUG_ON in the rds_iw_destroy_mr_pool function.

This change keeps track of the MRs that become unpinned, so that
free_pinned can be decremented appropriately.

Signed-off-by: Jonathan Lallinger <jonathan@ogc.us>
Signed-off-by: Steve Wise <swise@ogc.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/iw_rdma.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 8b77edbab27228..4e1de171866c7a 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -84,7 +84,8 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
 static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 			struct list_head *unmap_list,
-			struct list_head *kill_list);
+			struct list_head *kill_list,
+			int *unpinned);
 static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 
 static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
@@ -499,7 +500,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(kill_list);
 	unsigned long flags;
-	unsigned int nfreed = 0, ncleaned = 0, free_goal;
+	unsigned int nfreed = 0, ncleaned = 0, unpinned = 0, free_goal;
 	int ret = 0;
 
 	rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
@@ -524,7 +525,8 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 	 * will be destroyed by the unmap function.
 	 */
 	if (!list_empty(&unmap_list)) {
-		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
+		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
+						     &kill_list, &unpinned);
 		/* If we've been asked to destroy all MRs, move those
 		 * that were simply cleaned to the kill list */
 		if (free_all)
@@ -548,6 +550,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 		spin_unlock_irqrestore(&pool->list_lock, flags);
 	}
 
+	atomic_sub(unpinned, &pool->free_pinned);
 	atomic_sub(ncleaned, &pool->dirty_count);
 	atomic_sub(nfreed, &pool->item_count);
 
@@ -828,7 +831,8 @@ static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
 
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 				struct list_head *unmap_list,
-				struct list_head *kill_list)
+				struct list_head *kill_list,
+				int *unpinned)
 {
 	struct rds_iw_mapping *mapping, *next;
 	unsigned int ncleaned = 0;
@@ -855,6 +859,7 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 
 		spin_lock_irqsave(&pool->list_lock, flags);
 		list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+			*unpinned += mapping->m_sg.len;
 			list_move(&mapping->m_list, &laundered);
 			ncleaned++;
 		}

From 12d0d0d3a7349daa95dbfd5d7df8146255bc7c67 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Thu, 29 Sep 2011 15:33:47 -0400
Subject: [PATCH 10/15] can bcm: fix incomplete tx_setup fix

The commit aabdcb0b553b9c9547b1a506b34d55a764745870 ("can bcm: fix tx_setup
off-by-one errors") fixed only a part of the original problem reported by
Andre Naujoks. It turned out that the original code needed to be re-ordered
to reduce complexity and to finally fix the reported frame counting issues.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/can/bcm.c | 48 +++++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/net/can/bcm.c b/net/can/bcm.c
index c9cdb8d78e7088..c84963d2dee695 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -344,6 +344,18 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
 	}
 }
 
+static void bcm_tx_start_timer(struct bcm_op *op)
+{
+	if (op->kt_ival1.tv64 && op->count)
+		hrtimer_start(&op->timer,
+			      ktime_add(ktime_get(), op->kt_ival1),
+			      HRTIMER_MODE_ABS);
+	else if (op->kt_ival2.tv64)
+		hrtimer_start(&op->timer,
+			      ktime_add(ktime_get(), op->kt_ival2),
+			      HRTIMER_MODE_ABS);
+}
+
 static void bcm_tx_timeout_tsklet(unsigned long data)
 {
 	struct bcm_op *op = (struct bcm_op *)data;
@@ -365,23 +377,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
 
 			bcm_send_to_user(op, &msg_head, NULL, 0);
 		}
-
-		/* send (next) frame */
 		bcm_can_tx(op);
-		hrtimer_start(&op->timer,
-			      ktime_add(ktime_get(), op->kt_ival1),
-			      HRTIMER_MODE_ABS);
 
-	} else {
-		if (op->kt_ival2.tv64) {
+	} else if (op->kt_ival2.tv64)
+		bcm_can_tx(op);
 
-			/* send (next) frame */
-			bcm_can_tx(op);
-			hrtimer_start(&op->timer,
-				      ktime_add(ktime_get(), op->kt_ival2),
-				      HRTIMER_MODE_ABS);
-		}
-	}
+	bcm_tx_start_timer(op);
 }
 
 /*
@@ -961,28 +962,21 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 			hrtimer_cancel(&op->timer);
 	}
 
-	if ((op->flags & STARTTIMER) &&
-	    ((op->kt_ival1.tv64 && op->count) || op->kt_ival2.tv64)) {
-
+	if (op->flags & STARTTIMER) {
+		hrtimer_cancel(&op->timer);
 		/* spec: send can_frame when starting timer */
 		op->flags |= TX_ANNOUNCE;
-
-		/* only start timer when having more frames than sent below */
-		if (op->kt_ival1.tv64 && (op->count > 1)) {
-			/* op->count-- is done in bcm_tx_timeout_tsklet */
-			hrtimer_start(&op->timer, op->kt_ival1,
-				      HRTIMER_MODE_REL);
-		} else
-			hrtimer_start(&op->timer, op->kt_ival2,
-				      HRTIMER_MODE_REL);
 	}
 
 	if (op->flags & TX_ANNOUNCE) {
 		bcm_can_tx(op);
-		if (op->kt_ival1.tv64 && (op->count > 0))
+		if (op->count)
 			op->count--;
 	}
 
+	if (op->flags & STARTTIMER)
+		bcm_tx_start_timer(op);
+
 	return msg_head->nframes * CFSIZ + MHSIZ;
 }
 

From a0db2dad0935e798973bb79676e722b82f177206 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <andy@greyhouse.net>
Date: Fri, 23 Sep 2011 10:53:34 +0000
Subject: [PATCH 11/15] bonding: properly stop queuing work when requested

During a test where a pair of bonding interfaces using ARP monitoring
were both brought up and torn down (with an rmmod) repeatedly, a panic
in the timer code was noticed.  I tracked this down and determined that
any of the bonding functions that ran as workqueue handlers and requeued
more work might not properly exit when the module was removed.

There was a flag protected by the bond lock called kill_timers that is
set when the interface goes down or the module is removed, but many of
the functions that monitor link status now unlock the bond lock to take
rtnl first.  There is a chance that another CPU running the rmmod could
get the lock and set kill_timers after the first check has passed.

This patch does not allow any function to queue work that will make
itself run unless kill_timers is not set.  I also noticed while doing
this work that bond_resend_igmp_join_requests did not have a check for
kill_timers, so I added the needed call there as well.

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
Reported-by: Liang Zheng <lzheng@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_3ad.c  |  3 ++-
 drivers/net/bonding/bond_alb.c  |  3 ++-
 drivers/net/bonding/bond_main.c | 13 ++++++++-----
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index a047eb973e3bc6..47b928ed08f8a0 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2168,7 +2168,8 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 	}
 
 re_arm:
-	queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
+	if (!bond->kill_timers)
+		queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
 out:
 	read_unlock(&bond->lock);
 }
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 7f8b20a34ee344..d4fbd2e62616cf 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1440,7 +1440,8 @@ void bond_alb_monitor(struct work_struct *work)
 	}
 
 re_arm:
-	queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks);
+	if (!bond->kill_timers)
+		queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks);
 out:
 	read_unlock(&bond->lock);
 }
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 43f2ea5410884a..6d79b78cfc7559 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -777,6 +777,9 @@ static void bond_resend_igmp_join_requests(struct bonding *bond)
 
 	read_lock(&bond->lock);
 
+	if (bond->kill_timers)
+		goto out;
+
 	/* rejoin all groups on bond device */
 	__bond_resend_igmp_join_requests(bond->dev);
 
@@ -790,9 +793,9 @@ static void bond_resend_igmp_join_requests(struct bonding *bond)
 			__bond_resend_igmp_join_requests(vlan_dev);
 	}
 
-	if (--bond->igmp_retrans > 0)
+	if ((--bond->igmp_retrans > 0) && !bond->kill_timers)
 		queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
-
+out:
 	read_unlock(&bond->lock);
 }
 
@@ -2538,7 +2541,7 @@ void bond_mii_monitor(struct work_struct *work)
 	}
 
 re_arm:
-	if (bond->params.miimon)
+	if (bond->params.miimon && !bond->kill_timers)
 		queue_delayed_work(bond->wq, &bond->mii_work,
 				   msecs_to_jiffies(bond->params.miimon));
 out:
@@ -2886,7 +2889,7 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 	}
 
 re_arm:
-	if (bond->params.arp_interval)
+	if (bond->params.arp_interval && !bond->kill_timers)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
 out:
 	read_unlock(&bond->lock);
@@ -3154,7 +3157,7 @@ void bond_activebackup_arp_mon(struct work_struct *work)
 	bond_ab_arp_probe(bond);
 
 re_arm:
-	if (bond->params.arp_interval)
+	if (bond->params.arp_interval && !bond->kill_timers)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
 out:
 	read_unlock(&bond->lock);

From d0e5d83284dac15c015bb48115b6780f5a6413cd Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Fri, 30 Sep 2011 06:37:51 +0000
Subject: [PATCH 12/15] net: xen-netback: correctly restart Tx after a VM
 restore/migrate

If a VM is saved and restored (or migrated) the netback driver will no
longer process any Tx packets from the frontend.  xenvif_up() does not
schedule the processing of any pending Tx requests from the front end
because the carrier is off.  Without this initial kick the frontend
just adds Tx requests to the ring without raising an event (until the
ring is full).

This was caused by 47103041e91794acdbc6165da0ae288d844c820b (net:
xen-netback: convert to hw_features) which reordered the calls to
xenvif_up() and netif_carrier_on() in xenvif_connect().

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/interface.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 0ca86f9ec4ed38..182562952c792a 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -327,12 +327,12 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 	xenvif_get(vif);
 
 	rtnl_lock();
-	if (netif_running(vif->dev))
-		xenvif_up(vif);
 	if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN)
 		dev_set_mtu(vif->dev, ETH_DATA_LEN);
 	netdev_update_features(vif->dev);
 	netif_carrier_on(vif->dev);
+	if (netif_running(vif->dev))
+		xenvif_up(vif);
 	rtnl_unlock();
 
 	return 0;

From 7091fbd82cd5686444ffe9935ed6a8190101fe9d Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 30 Sep 2011 10:38:28 +0000
Subject: [PATCH 13/15] make PACKET_STATISTICS getsockopt report consistently
 between ring and non-ring

This is a minor change.

Up until kernel 2.6.32, getsockopt(fd, SOL_PACKET, PACKET_STATISTICS,
...) would return total and dropped packets since its last invocation. The
introduction of socket queue overflow reporting [1] changed drop
rate calculation in the normal packet socket path, but not when using a
packet ring. As a result, the getsockopt now returns different statistics
depending on the reception method used. With a ring, it still returns the
count since the last call, as counts are incremented in tpacket_rcv and
reset in getsockopt. Without a ring, it returns 0 if no drops occurred
since the last getsockopt and the total drops over the lifespan of
the socket otherwise. The culprit is this line in packet_rcv, executed
on a drop:

drop_n_acct:
        po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

As it shows, the new drop number it taken from the socket drop counter,
which is not reset at getsockopt. I put together a small example
that demonstrates the issue [2]. It runs for 10 seconds and overflows
the queue/ring on every odd second. The reported drop rates are:
ring: 16, 0, 16, 0, 16, ...
non-ring: 0, 15, 0, 30, 0, 46, 0, 60, 0 , 74.

Note how the even ring counts monotonically increase. Because the
getsockopt adds tp_drops to tp_packets, total counts are similarly
reported cumulatively. Long story short, reinstating the original code, as
the below patch does, fixes the issue at the cost of additional per-packet
cycles. Another solution that does not introduce per-packet overhead
is be to keep the current data path, record the value of sk_drops at
getsockopt() at call N in a new field in struct packetsock and subtract
that when reporting at call N+1. I'll be happy to code that, instead,
it's just more messy.

[1] http://patchwork.ozlabs.org/patch/35665/
[2] http://kernel.googlecode.com/files/test-packetsock-getstatistics.c

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c698cec0a44541..fabb4fafa281ce 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -961,7 +961,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 	return 0;
 
 drop_n_acct:
-	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
+	spin_lock(&sk->sk_receive_queue.lock);
+	po->stats.tp_drops++;
+	atomic_inc(&sk->sk_drops);
+	spin_unlock(&sk->sk_receive_queue.lock);
 
 drop_n_restore:
 	if (skb_head != skb->data && skb_shared(skb)) {

From 5f3a11419099d5cc010cfbfc524ca10d8fb81f89 Mon Sep 17 00:00:00 2001
From: Toshiharu Okada <toshiharu-linux@dsn.okisemi.com>
Date: Sun, 25 Sep 2011 21:27:42 +0000
Subject: [PATCH 14/15] pch_gbe: Fixed the issue on which PC was frozen when
 link was downed.

When a link was downed during network use,
there is an issue on which PC freezes.

This patch fixed this issue.

Signed-off-by: Toshiharu Okada <toshiharu-linux@dsn.okisemi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pch_gbe/pch_gbe_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c
index 567ff10889be62..5474189d320be9 100644
--- a/drivers/net/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/pch_gbe/pch_gbe_main.c
@@ -2195,7 +2195,7 @@ static int pch_gbe_napi_poll(struct napi_struct *napi, int budget)
 		/* If no Tx and not enough Rx work done,
 		 * exit the polling mode
 		 */
-		if ((work_done < budget) || !netif_running(netdev))
+		if (work_done < budget)
 			poll_end_flag = true;
 	}
 

From 805e969f6151eda7bc1a57e9c737054230acc3cc Mon Sep 17 00:00:00 2001
From: Toshiharu Okada <toshiharu-linux@dsn.okisemi.com>
Date: Sun, 25 Sep 2011 21:27:43 +0000
Subject: [PATCH 15/15] pch_gbe: Fixed the issue on which a network freezes

The pch_gbe driver has an issue which a network stops,
when receiving traffic is high.
In the case, The link down and up are necessary to return a network.

This patch fixed this issue.

Signed-off-by: Toshiharu Okada <toshiharu-linux@dsn.okisemi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pch_gbe/pch_gbe_main.c | 56 ++++++++++++++----------------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c
index 5474189d320be9..b8b4ba27b0e7ca 100644
--- a/drivers/net/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/pch_gbe/pch_gbe_main.c
@@ -1199,6 +1199,8 @@ static irqreturn_t pch_gbe_intr(int irq, void *data)
 			iowrite32((int_en & ~PCH_GBE_INT_RX_FIFO_ERR),
 				  &hw->reg->INT_EN);
 			pch_gbe_stop_receive(adapter);
+			int_st |= ioread32(&hw->reg->INT_ST);
+			int_st = int_st & ioread32(&hw->reg->INT_EN);
 		}
 	if (int_st & PCH_GBE_INT_RX_DMA_ERR)
 		adapter->stats.intr_rx_dma_err_count++;
@@ -1218,14 +1220,11 @@ static irqreturn_t pch_gbe_intr(int irq, void *data)
 			/* Set Pause packet */
 			pch_gbe_mac_set_pause_packet(hw);
 		}
-		if ((int_en & (PCH_GBE_INT_RX_DMA_CMPLT | PCH_GBE_INT_TX_CMPLT))
-		    == 0) {
-			return IRQ_HANDLED;
-		}
 	}
 
 	/* When request status is Receive interruption */
-	if ((int_st & (PCH_GBE_INT_RX_DMA_CMPLT | PCH_GBE_INT_TX_CMPLT))) {
+	if ((int_st & (PCH_GBE_INT_RX_DMA_CMPLT | PCH_GBE_INT_TX_CMPLT)) ||
+	    (adapter->rx_stop_flag == true)) {
 		if (likely(napi_schedule_prep(&adapter->napi))) {
 			/* Enable only Rx Descriptor empty */
 			atomic_inc(&adapter->irq_sem);
@@ -1385,7 +1384,7 @@ pch_gbe_clean_tx(struct pch_gbe_adapter *adapter,
 	struct sk_buff *skb;
 	unsigned int i;
 	unsigned int cleaned_count = 0;
-	bool cleaned = false;
+	bool cleaned = true;
 
 	pr_debug("next_to_clean : %d\n", tx_ring->next_to_clean);
 
@@ -1396,7 +1395,6 @@ pch_gbe_clean_tx(struct pch_gbe_adapter *adapter,
 
 	while ((tx_desc->gbec_status & DSC_INIT16) == 0x0000) {
 		pr_debug("gbec_status:0x%04x\n", tx_desc->gbec_status);
-		cleaned = true;
 		buffer_info = &tx_ring->buffer_info[i];
 		skb = buffer_info->skb;
 
@@ -1439,8 +1437,10 @@ pch_gbe_clean_tx(struct pch_gbe_adapter *adapter,
 		tx_desc = PCH_GBE_TX_DESC(*tx_ring, i);
 
 		/* weight of a sort for tx, to avoid endless transmit cleanup */
-		if (cleaned_count++ == PCH_GBE_TX_WEIGHT)
+		if (cleaned_count++ == PCH_GBE_TX_WEIGHT) {
+			cleaned = false;
 			break;
+		}
 	}
 	pr_debug("called pch_gbe_unmap_and_free_tx_resource() %d count\n",
 		 cleaned_count);
@@ -2168,7 +2168,6 @@ static int pch_gbe_napi_poll(struct napi_struct *napi, int budget)
 {
 	struct pch_gbe_adapter *adapter =
 	    container_of(napi, struct pch_gbe_adapter, napi);
-	struct net_device *netdev = adapter->netdev;
 	int work_done = 0;
 	bool poll_end_flag = false;
 	bool cleaned = false;
@@ -2176,33 +2175,32 @@ static int pch_gbe_napi_poll(struct napi_struct *napi, int budget)
 
 	pr_debug("budget : %d\n", budget);
 
-	/* Keep link state information with original netdev */
-	if (!netif_carrier_ok(netdev)) {
+	pch_gbe_clean_rx(adapter, adapter->rx_ring, &work_done, budget);
+	cleaned = pch_gbe_clean_tx(adapter, adapter->tx_ring);
+
+	if (!cleaned)
+		work_done = budget;
+	/* If no Tx and not enough Rx work done,
+	 * exit the polling mode
+	 */
+	if (work_done < budget)
 		poll_end_flag = true;
-	} else {
-		pch_gbe_clean_rx(adapter, adapter->rx_ring, &work_done, budget);
+
+	if (poll_end_flag) {
+		napi_complete(napi);
+		if (adapter->rx_stop_flag) {
+			adapter->rx_stop_flag = false;
+			pch_gbe_start_receive(&adapter->hw);
+		}
+		pch_gbe_irq_enable(adapter);
+	} else
 		if (adapter->rx_stop_flag) {
 			adapter->rx_stop_flag = false;
 			pch_gbe_start_receive(&adapter->hw);
 			int_en = ioread32(&adapter->hw.reg->INT_EN);
 			iowrite32((int_en | PCH_GBE_INT_RX_FIFO_ERR),
-					&adapter->hw.reg->INT_EN);
+				&adapter->hw.reg->INT_EN);
 		}
-		cleaned = pch_gbe_clean_tx(adapter, adapter->tx_ring);
-
-		if (cleaned)
-			work_done = budget;
-		/* If no Tx and not enough Rx work done,
-		 * exit the polling mode
-		 */
-		if (work_done < budget)
-			poll_end_flag = true;
-	}
-
-	if (poll_end_flag) {
-		napi_complete(napi);
-		pch_gbe_irq_enable(adapter);
-	}
 
 	pr_debug("poll_end_flag : %d  work_done : %d  budget : %d\n",
 		 poll_end_flag, work_done, budget);