Permalink
Browse files

Merge branch 'for-3.13/core' of git://git.kernel.dk/linux-block

Pull block IO core updates from Jens Axboe:
 "This is the pull request for the core changes in the block layer for
  3.13.  It contains:

   - The new blk-mq request interface.

     This is a new and more scalable queueing model that marries the
     best part of the request based interface we currently have (which
     is fully featured, but scales poorly) and the bio based "interface"
     which the new drivers for high IOPS devices end up using because
     it's much faster than the request based one.

     The bio interface has no block layer support, since it taps into
     the stack much earlier.  This means that drivers end up having to
     implement a lot of functionality on their own, like tagging,
     timeout handling, requeue, etc.  The blk-mq interface provides all
     these.  Some drivers even provide a switch to select bio or rq and
     has code to handle both, since things like merging only works in
     the rq model and hence is faster for some workloads.  This is a
     huge mess.  Conversion of these drivers nets us a substantial code
     reduction.  Initial results on converting SCSI to this model even
     shows an 8x improvement on single queue devices.  So while the
     model was intended to work on the newer multiqueue devices, it has
     substantial improvements for "classic" hardware as well.  This code
     has gone through extensive testing and development, it's now ready
     to go.  A pull request is coming to convert virtio-blk to this
     model will be will be coming as well, with more drivers scheduled
     for 3.14 conversion.

   - Two blktrace fixes from Jan and Chen Gang.

   - A plug merge fix from Alireza Haghdoost.

   - Conversion of __get_cpu_var() from Christoph Lameter.

   - Fix for sector_div() with 64-bit divider from Geert Uytterhoeven.

   - A fix for a race between request completion and the timeout
     handling from Jeff Moyer.  This is what caused the merge conflict
     with blk-mq/core, in case you are looking at that.

   - A dm stacking fix from Mike Snitzer.

   - A code consolidation fix and duplicated code removal from Kent
     Overstreet.

   - A handful of block bug fixes from Mikulas Patocka, fixing a loop
     crash and memory corruption on blk cg.

   - Elevator switch bug fix from Tomoki Sekiyama.

  A heads-up that I had to rebase this branch.  Initially the immutable
  bio_vecs had been queued up for inclusion, but a week later, it became
  clear that it wasn't fully cooked yet.  So the decision was made to
  pull this out and postpone it until 3.14.  It was a straight forward
  rebase, just pruning out the immutable series and the later fixes of
  problems with it.  The rest of the patches applied directly and no
  further changes were made"

* 'for-3.13/core' of git://git.kernel.dk/linux-block: (31 commits)
  block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
  block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
  block: Do not call sector_div() with a 64-bit divisor
  kernel: trace: blktrace: remove redundent memcpy() in compat_blk_trace_setup()
  block: Consolidate duplicated bio_trim() implementations
  block: Use rw_copy_check_uvector()
  block: Enable sysfs nomerge control for I/O requests in the plug list
  block: properly stack underlying max_segment_size to DM device
  elevator: acquire q->sysfs_lock in elevator_change()
  elevator: Fix a race in elevator switching and md device initialization
  block: Replace __get_cpu_var uses
  bdi: test bdi_init failure
  block: fix a probe argument to blk_register_region
  loop: fix crash if blk_alloc_queue fails
  blk-core: Fix memory corruption if blkcg_init_queue fails
  block: fix race between request completion and timeout handling
  blktrace: Send BLK_TN_PROCESS events to all running traces
  blk-mq: don't disallow request merges for req->special being set
  blk-mq: mq plug list breakage
  blk-mq: fix for flush deadlock
  ...
  • Loading branch information...
2 parents 2821fe6 + e37459b commit 0910c0bdf7c291a41bc21e40a97389c9d4c1960d @torvalds committed Nov 14, 2013
View
5 block/Makefile
@@ -5,8 +5,9 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
- partition-generic.o partitions/
+ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
+ genhd.o scsi_ioctl.o partition-generic.o partitions/
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
View
175 block/blk-core.c
@@ -16,6 +16,7 @@
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
@@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida);
/*
* For the allocated request tables
*/
-static struct kmem_cache *request_cachep;
+struct kmem_cache *request_cachep = NULL;
/*
* For queue allocation
@@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
-static void drive_stat_acct(struct request *rq, int new_io)
-{
- struct hd_struct *part;
- int rw = rq_data_dir(rq);
- int cpu;
-
- if (!blk_do_io_stat(rq))
- return;
-
- cpu = part_stat_lock();
-
- if (!new_io) {
- part = rq->part;
- part_stat_inc(cpu, part, merges[rw]);
- } else {
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!hd_struct_try_get(part)) {
- /*
- * The partition is already being removed,
- * the request will be accounted on the disk only
- *
- * We take a reference on disk->part0 although that
- * partition will never be deleted, so we can treat
- * it as any other partition.
- */
- part = &rq->rq_disk->part0;
- hd_struct_get(part);
- }
- part_round_stats(cpu, part);
- part_inc_in_flight(part, rw);
- rq->part = part;
- }
-
- part_stat_unlock();
-}
-
void blk_queue_congestion_threshold(struct request_queue *q)
{
int nr;
@@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->cmd = rq->__cmd;
rq->cmd_len = BLK_MAX_CDB;
rq->tag = -1;
- rq->ref_count = 1;
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
@@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
{
int bit;
- printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
+ printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,
rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
- rq->cmd_flags);
+ (unsigned long long) rq->cmd_flags);
printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
(unsigned long long)blk_rq_pos(rq),
@@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q)
return NULL;
+ if (percpu_counter_init(&q->mq_usage_counter, 0))
+ goto fail_q;
+
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
if (q->id < 0)
- goto fail_q;
+ goto fail_c;
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -644,13 +611,19 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->bypass_depth = 1;
__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+ init_waitqueue_head(&q->mq_freeze_wq);
+
if (blkcg_init_queue(q))
- goto fail_id;
+ goto fail_bdi;
return q;
+fail_bdi:
+ bdi_destroy(&q->backing_dev_info);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
+fail_c:
+ percpu_counter_destroy(&q->mq_usage_counter);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
@@ -739,9 +712,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
q->sg_reserved_size = INT_MAX;
+ /* Protect q->elevator from elevator_change */
+ mutex_lock(&q->sysfs_lock);
+
/* init elevator */
- if (elevator_init(q, NULL))
+ if (elevator_init(q, NULL)) {
+ mutex_unlock(&q->sysfs_lock);
return NULL;
+ }
+
+ mutex_unlock(&q->sysfs_lock);
+
return q;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1109,7 +1090,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
goto retry;
}
-struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+static struct request *blk_old_get_request(struct request_queue *q, int rw,
+ gfp_t gfp_mask)
{
struct request *rq;
@@ -1126,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
return rq;
}
+
+struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+{
+ if (q->mq_ops)
+ return blk_mq_alloc_request(q, rw, gfp_mask, false);
+ else
+ return blk_old_get_request(q, rw, gfp_mask);
+}
EXPORT_SYMBOL(blk_get_request);
/**
@@ -1211,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request);
static void add_acct_request(struct request_queue *q, struct request *rq,
int where)
{
- drive_stat_acct(rq, 1);
+ blk_account_io_start(rq, true);
__elv_add_request(q, rq, where);
}
@@ -1272,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req)
{
if (unlikely(!q))
return;
- if (unlikely(--req->ref_count))
- return;
blk_pm_put_request(req);
@@ -1302,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request);
void blk_put_request(struct request *req)
{
- unsigned long flags;
struct request_queue *q = req->q;
- spin_lock_irqsave(q->queue_lock, flags);
- __blk_put_request(q, req);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ if (q->mq_ops)
+ blk_mq_free_request(req);
+ else {
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ __blk_put_request(q, req);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ }
}
EXPORT_SYMBOL(blk_put_request);
@@ -1343,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page,
}
EXPORT_SYMBOL_GPL(blk_add_request_payload);
-static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
+bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+ struct bio *bio)
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
@@ -1361,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
req->__data_len += bio->bi_size;
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
- drive_stat_acct(req, 0);
+ blk_account_io_start(req, false);
return true;
}
-static bool bio_attempt_front_merge(struct request_queue *q,
- struct request *req, struct bio *bio)
+bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
+ struct bio *bio)
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
@@ -1391,12 +1384,12 @@ static bool bio_attempt_front_merge(struct request_queue *q,
req->__data_len += bio->bi_size;
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
- drive_stat_acct(req, 0);
+ blk_account_io_start(req, false);
return true;
}
/**
- * attempt_plug_merge - try to merge with %current's plugged list
+ * blk_attempt_plug_merge - try to merge with %current's plugged list
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @request_count: out parameter for number of traversed plugged requests
@@ -1412,19 +1405,28 @@ static bool bio_attempt_front_merge(struct request_queue *q,
* reliable access to the elevator outside queue lock. Only check basic
* merging parameters without querying the elevator.
*/
-static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int *request_count)
+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
+ unsigned int *request_count)
{
struct blk_plug *plug;
struct request *rq;
bool ret = false;
+ struct list_head *plug_list;
+
+ if (blk_queue_nomerges(q))
+ goto out;
plug = current->plug;
if (!plug)
goto out;
*request_count = 0;
- list_for_each_entry_reverse(rq, &plug->list, queuelist) {
+ if (q->mq_ops)
+ plug_list = &plug->mq_list;
+ else
+ plug_list = &plug->list;
+
+ list_for_each_entry_reverse(rq, plug_list, queuelist) {
int el_ret;
if (rq->q == q)
@@ -1492,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
* Check if we can merge with the plugged list before grabbing
* any locks.
*/
- if (attempt_plug_merge(q, bio, &request_count))
+ if (blk_attempt_plug_merge(q, bio, &request_count))
return;
spin_lock_irq(q->queue_lock);
@@ -1560,7 +1562,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
}
}
list_add_tail(&req->queuelist, &plug->list);
- drive_stat_acct(req, 1);
+ blk_account_io_start(req, true);
} else {
spin_lock_irq(q->queue_lock);
add_acct_request(q, req, where);
@@ -2014,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
-static void blk_account_io_completion(struct request *req, unsigned int bytes)
+void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (blk_do_io_stat(req)) {
const int rw = rq_data_dir(req);
@@ -2028,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
}
}
-static void blk_account_io_done(struct request *req)
+void blk_account_io_done(struct request *req)
{
/*
* Account IO completion. flush_rq isn't accounted as a
@@ -2076,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q,
}
#endif
+void blk_account_io_start(struct request *rq, bool new_io)
+{
+ struct hd_struct *part;
+ int rw = rq_data_dir(rq);
+ int cpu;
+
+ if (!blk_do_io_stat(rq))
+ return;
+
+ cpu = part_stat_lock();
+
+ if (!new_io) {
+ part = rq->part;
+ part_stat_inc(cpu, part, merges[rw]);
+ } else {
+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+ if (!hd_struct_try_get(part)) {
+ /*
+ * The partition is already being removed,
+ * the request will be accounted on the disk only
+ *
+ * We take a reference on disk->part0 although that
+ * partition will never be deleted, so we can treat
+ * it as any other partition.
+ */
+ part = &rq->rq_disk->part0;
+ hd_struct_get(part);
+ }
+ part_round_stats(cpu, part);
+ part_inc_in_flight(part, rw);
+ rq->part = part;
+ }
+
+ part_stat_unlock();
+}
+
/**
* blk_peek_request - peek at the top of a request queue
* @q: request queue to peek at
@@ -2227,6 +2265,7 @@ void blk_start_request(struct request *req)
if (unlikely(blk_bidi_rq(req)))
req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
+ BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
blk_add_timer(req);
}
EXPORT_SYMBOL(blk_start_request);
@@ -2451,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error)
if (req->cmd_flags & REQ_DONTPREP)
blk_unprep_request(req);
-
blk_account_io_done(req);
if (req->end_io)
@@ -2873,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug)
plug->magic = PLUG_MAGIC;
INIT_LIST_HEAD(&plug->list);
+ INIT_LIST_HEAD(&plug->mq_list);
INIT_LIST_HEAD(&plug->cb_list);
/*
@@ -2970,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
BUG_ON(plug->magic != PLUG_MAGIC);
flush_plug_callbacks(plug, from_schedule);
+
+ if (!list_empty(&plug->mq_list))
+ blk_mq_flush_plug_list(plug, from_schedule);
+
if (list_empty(&plug->list))
return;
View
14 block/blk-exec.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/sched/sysctl.h>
#include "blk.h"
@@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error)
struct completion *waiting = rq->end_io_data;
rq->end_io_data = NULL;
- __blk_put_request(rq->q, rq);
/*
* complete last, if this is a stack request the process (and thus
@@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq->rq_disk = bd_disk;
rq->end_io = done;
+
+ if (q->mq_ops) {
+ blk_mq_insert_request(q, rq, true);
+ return;
+ }
+
/*
* need to check this before __blk_run_queue(), because rq can
* be freed before that returns.
@@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
int err = 0;
unsigned long hang_check;
- /*
- * we need an extra reference to the request, so we can look at
- * it after io completion
- */
- rq->ref_count++;
-
if (!rq->sense) {
memset(sense, 0, sizeof(sense));
rq->sense = sense;
View
154 block/blk-flush.c
@@ -69,8 +69,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
+#include <linux/blk-mq.h>
#include "blk.h"
+#include "blk-mq.h"
/* FLUSH/FUA sequences */
enum {
@@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq)
/* make @rq a normal request */
rq->cmd_flags &= ~REQ_FLUSH_SEQ;
rq->end_io = rq->flush.saved_end_io;
+
+ blk_clear_rq_complete(rq);
+}
+
+static void mq_flush_data_run(struct work_struct *work)
+{
+ struct request *rq;
+
+ rq = container_of(work, struct request, mq_flush_data);
+
+ memset(&rq->csd, 0, sizeof(rq->csd));
+ blk_mq_run_request(rq, true, false);
+}
+
+static void blk_mq_flush_data_insert(struct request *rq)
+{
+ INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
+ kblockd_schedule_work(rq->q, &rq->mq_flush_data);
}
/**
@@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq)
* completion and trigger the next step.
*
* CONTEXT:
- * spin_lock_irq(q->queue_lock)
+ * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
*
* RETURNS:
* %true if requests were added to the dispatch queue, %false otherwise.
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
{
struct request_queue *q = rq->q;
struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
- bool queued = false;
+ bool queued = false, kicked;
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
@@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
- list_add(&rq->queuelist, &q->queue_head);
- queued = true;
+ if (q->mq_ops)
+ blk_mq_flush_data_insert(rq);
+ else {
+ list_add(&rq->queuelist, &q->queue_head);
+ queued = true;
+ }
break;
case REQ_FSEQ_DONE:
@@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
BUG_ON(!list_empty(&rq->queuelist));
list_del_init(&rq->flush.list);
blk_flush_restore_request(rq);
- __blk_end_request_all(rq, error);
+ if (q->mq_ops)
+ blk_mq_end_io(rq, error);
+ else
+ __blk_end_request_all(rq, error);
break;
default:
BUG();
}
- return blk_kick_flush(q) | queued;
+ kicked = blk_kick_flush(q);
+ /* blk_mq_run_flush will run queue */
+ if (q->mq_ops)
+ return queued;
+ return kicked | queued;
}
static void flush_end_io(struct request *flush_rq, int error)
{
struct request_queue *q = flush_rq->q;
- struct list_head *running = &q->flush_queue[q->flush_running_idx];
+ struct list_head *running;
bool queued = false;
struct request *rq, *n;
+ unsigned long flags = 0;
+ if (q->mq_ops) {
+ blk_mq_free_request(flush_rq);
+ spin_lock_irqsave(&q->mq_flush_lock, flags);
+ }
+ running = &q->flush_queue[q->flush_running_idx];
BUG_ON(q->flush_pending_idx == q->flush_running_idx);
/* account completion of the flush request */
q->flush_running_idx ^= 1;
- elv_completed_request(q, flush_rq);
+
+ if (!q->mq_ops)
+ elv_completed_request(q, flush_rq);
/* and push the waiting requests to the next stage */
list_for_each_entry_safe(rq, n, running, flush.list) {
@@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error)
* directly into request_fn may confuse the driver. Always use
* kblockd.
*/
- if (queued || q->flush_queue_delayed)
- blk_run_queue_async(q);
+ if (queued || q->flush_queue_delayed) {
+ if (!q->mq_ops)
+ blk_run_queue_async(q);
+ else
+ /*
+ * This can be optimized to only run queues with requests
+ * queued if necessary.
+ */
+ blk_mq_run_queues(q, true);
+ }
q->flush_queue_delayed = 0;
+ if (q->mq_ops)
+ spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+}
+
+static void mq_flush_work(struct work_struct *work)
+{
+ struct request_queue *q;
+ struct request *rq;
+
+ q = container_of(work, struct request_queue, mq_flush_work);
+
+ /* We don't need set REQ_FLUSH_SEQ, it's for consistency */
+ rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
+ __GFP_WAIT|GFP_ATOMIC, true);
+ rq->cmd_type = REQ_TYPE_FS;
+ rq->end_io = flush_end_io;
+
+ blk_mq_run_request(rq, true, false);
+}
+
+/*
+ * We can't directly use q->flush_rq, because it doesn't have tag and is not in
+ * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
+ * so offload the work to workqueue.
+ *
+ * Note: we assume a flush request finished in any hardware queue will flush
+ * the whole disk cache.
+ */
+static void mq_run_flush(struct request_queue *q)
+{
+ kblockd_schedule_work(q, &q->mq_flush_work);
}
/**
@@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error)
* Please read the comment at the top of this file for more info.
*
* CONTEXT:
- * spin_lock_irq(q->queue_lock)
+ * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
*
* RETURNS:
* %true if flush was issued, %false otherwise.
@@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q)
* Issue flush and toggle pending_idx. This makes pending_idx
* different from running_idx, which means flush is in flight.
*/
+ q->flush_pending_idx ^= 1;
+ if (q->mq_ops) {
+ mq_run_flush(q);
+ return true;
+ }
+
blk_rq_init(q, &q->flush_rq);
q->flush_rq.cmd_type = REQ_TYPE_FS;
q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
q->flush_rq.rq_disk = first_rq->rq_disk;
q->flush_rq.end_io = flush_end_io;
- q->flush_pending_idx ^= 1;
list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
return true;
}
@@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error)
blk_run_queue_async(q);
}
+static void mq_flush_data_end_io(struct request *rq, int error)
+{
+ struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ unsigned long flags;
+
+ ctx = rq->mq_ctx;
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+ /*
+ * After populating an empty queue, kick it to avoid stall. Read
+ * the comment in flush_end_io().
+ */
+ spin_lock_irqsave(&q->mq_flush_lock, flags);
+ if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
+ blk_mq_run_hw_queue(hctx, true);
+ spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+}
+
/**
* blk_insert_flush - insert a new FLUSH/FUA request
* @rq: request to insert
*
* To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
+ * or __blk_mq_run_hw_queue() to dispatch request.
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*
* CONTEXT:
- * spin_lock_irq(q->queue_lock)
+ * spin_lock_irq(q->queue_lock) in !mq case
*/
void blk_insert_flush(struct request *rq)
{
@@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq)
* complete the request.
*/
if (!policy) {
- __blk_end_bidi_request(rq, 0, 0, 0);
+ if (q->mq_ops)
+ blk_mq_end_io(rq, 0);
+ else
+ __blk_end_bidi_request(rq, 0, 0, 0);
return;
}
@@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- list_add_tail(&rq->queuelist, &q->queue_head);
+ if (q->mq_ops) {
+ blk_mq_run_request(rq, false, true);
+ } else
+ list_add_tail(&rq->queuelist, &q->queue_head);
return;
}
@@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq)
INIT_LIST_HEAD(&rq->flush.list);
rq->cmd_flags |= REQ_FLUSH_SEQ;
rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
+ if (q->mq_ops) {
+ rq->end_io = mq_flush_data_end_io;
+
+ spin_lock_irq(&q->mq_flush_lock);
+ blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
+ spin_unlock_irq(&q->mq_flush_lock);
+ return;
+ }
rq->end_io = flush_data_end_io;
blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
@@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
+
+void blk_mq_init_flush(struct request_queue *q)
+{
+ spin_lock_init(&q->mq_flush_lock);
+ INIT_WORK(&q->mq_flush_work, mq_flush_work);
+}
View
6 block/blk-iopoll.c
@@ -35,7 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
unsigned long flags;
local_irq_save(flags);
- list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
+ list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_restore(flags);
}
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(blk_iopoll_complete);
static void blk_iopoll_softirq(struct softirq_action *h)
{
- struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
+ struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
int rearm = 0, budget = blk_iopoll_budget;
unsigned long start_time = jiffies;
@@ -201,7 +201,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
- &__get_cpu_var(blk_cpu_iopoll));
+ this_cpu_ptr(&blk_cpu_iopoll));
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_enable();
}
View
10 block/blk-lib.c
@@ -43,8 +43,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD;
- sector_t max_discard_sectors;
- sector_t granularity, alignment;
+ unsigned int max_discard_sectors, granularity;
+ int alignment;
struct bio_batch bb;
struct bio *bio;
int ret = 0;
@@ -58,16 +58,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
/* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(q->limits.discard_granularity >> 9, 1U);
- alignment = bdev_discard_alignment(bdev) >> 9;
- alignment = sector_div(alignment, granularity);
+ alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
/*
* Ensure that max_discard_sectors is of the proper
* granularity, so that requests stay aligned after a split.
*/
max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
- sector_div(max_discard_sectors, granularity);
- max_discard_sectors *= granularity;
+ max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors)) {
/* Avoid infinite loop below. Being cautious never hurts. */
return -EOPNOTSUPP;
View
17 block/blk-merge.c
@@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
return ll_new_hw_segment(q, req, bio);
}
+/*
+ * blk-mq uses req->special to carry normal driver per-request payload, it
+ * does not indicate a prepared command that we cannot merge with.
+ */
+static bool req_no_special_merge(struct request *req)
+{
+ struct request_queue *q = req->q;
+
+ return !q->mq_ops && req->special;
+}
+
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
@@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
* First check if the either of the requests are re-queued
* requests. Can't merge them if they are.
*/
- if (req->special || next->special)
+ if (req_no_special_merge(req) || req_no_special_merge(next))
return 0;
/*
@@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
if (rq_data_dir(req) != rq_data_dir(next)
|| req->rq_disk != next->rq_disk
- || next->special)
+ || req_no_special_merge(next))
return 0;
if (req->cmd_flags & REQ_WRITE_SAME &&
@@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
/* must be same device and not a special request */
- if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
+ if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))
return false;
/* only merge integrity protected bio into ditto rq */
View
93 block/blk-mq-cpu.c
@@ -0,0 +1,93 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/list.h>
+#include <linux/llist.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <linux/blk-mq.h>
+#include "blk-mq.h"
+
+static LIST_HEAD(blk_mq_cpu_notify_list);
+static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
+
+static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long) hcpu;
+ struct blk_mq_cpu_notifier *notify;
+
+ spin_lock(&blk_mq_cpu_notify_lock);
+
+ list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
+ notify->notify(notify->data, action, cpu);
+
+ spin_unlock(&blk_mq_cpu_notify_lock);
+ return NOTIFY_OK;
+}
+
+static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action,
+ unsigned int cpu)
+{
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ /*
+ * If the CPU goes away, ensure that we run any pending
+ * completions.
+ */
+ struct llist_node *node;
+ struct request *rq;
+
+ local_irq_disable();
+
+ node = llist_del_all(&per_cpu(ipi_lists, cpu));
+ while (node) {
+ struct llist_node *next = node->next;
+
+ rq = llist_entry(node, struct request, ll_list);
+ __blk_mq_end_io(rq, rq->errors);
+ node = next;
+ }
+
+ local_irq_enable();
+ }
+}
+
+static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = {
+ .notifier_call = blk_mq_main_cpu_notify,
+};
+
+void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
+{
+ BUG_ON(!notifier->notify);
+
+ spin_lock(&blk_mq_cpu_notify_lock);
+ list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
+ spin_unlock(&blk_mq_cpu_notify_lock);
+}
+
+void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
+{
+ spin_lock(&blk_mq_cpu_notify_lock);
+ list_del(&notifier->list);
+ spin_unlock(&blk_mq_cpu_notify_lock);
+}
+
+void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
+ void (*fn)(void *, unsigned long, unsigned int),
+ void *data)
+{
+ notifier->notify = fn;
+ notifier->data = data;
+}
+
+static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = {
+ .notify = blk_mq_cpu_notify,
+};
+
+void __init blk_mq_cpu_init(void)
+{
+ register_hotcpu_notifier(&blk_mq_main_cpu_notifier);
+ blk_mq_register_cpu_notifier(&cpu_notifier);
+}
View
108 block/blk-mq-cpumap.c
@@ -0,0 +1,108 @@
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+
+static void show_map(unsigned int *map, unsigned int nr)
+{
+ int i;
+
+ pr_info("blk-mq: CPU -> queue map\n");
+ for_each_online_cpu(i)
+ pr_info(" CPU%2u -> Queue %u\n", i, map[i]);
+}
+
+static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
+ const int cpu)
+{
+ return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
+}
+
+static int get_first_sibling(unsigned int cpu)
+{
+ unsigned int ret;
+
+ ret = cpumask_first(topology_thread_cpumask(cpu));
+ if (ret < nr_cpu_ids)
+ return ret;
+
+ return cpu;
+}
+
+int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
+{
+ unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
+ cpumask_var_t cpus;
+
+ if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
+ return 1;
+
+ cpumask_clear(cpus);
+ nr_cpus = nr_uniq_cpus = 0;
+ for_each_online_cpu(i) {
+ nr_cpus++;
+ first_sibling = get_first_sibling(i);
+ if (!cpumask_test_cpu(first_sibling, cpus))
+ nr_uniq_cpus++;
+ cpumask_set_cpu(i, cpus);
+ }
+
+ queue = 0;
+ for_each_possible_cpu(i) {
+ if (!cpu_online(i)) {
+ map[i] = 0;
+ continue;
+ }
+
+ /*
+ * Easy case - we have equal or more hardware queues. Or
+ * there are no thread siblings to take into account. Do
+ * 1:1 if enough, or sequential mapping if less.
+ */
+ if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
+ map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
+ queue++;
+ continue;
+ }
+
+ /*
+ * Less then nr_cpus queues, and we have some number of
+ * threads per cores. Map sibling threads to the same
+ * queue.
+ */
+ first_sibling = get_first_sibling(i);
+ if (first_sibling == i) {
+ map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
+ queue);
+ queue++;
+ } else
+ map[i] = map[first_sibling];
+ }
+
+ show_map(map, nr_cpus);
+ free_cpumask_var(cpus);
+ return 0;
+}
+
+unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
+{
+ unsigned int *map;
+
+ /* If cpus are offline, map them to first hctx */
+ map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
+ reg->numa_node);
+ if (!map)
+ return NULL;
+
+ if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
+ return map;
+
+ kfree(map);
+ return NULL;
+}
View
384 block/blk-mq-sysfs.c
@@ -0,0 +1,384 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/smp.h>
+
+#include <linux/blk-mq.h>
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+static void blk_mq_sysfs_release(struct kobject *kobj)
+{
+}
+
+struct blk_mq_ctx_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_mq_ctx *, char *);
+ ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
+};
+
+struct blk_mq_hw_ctx_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
+ ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
+};
+
+static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
+ char *page)
+{
+ struct blk_mq_ctx_sysfs_entry *entry;
+ struct blk_mq_ctx *ctx;
+ struct request_queue *q;
+ ssize_t res;
+
+ entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
+ ctx = container_of(kobj, struct blk_mq_ctx, kobj);
+ q = ctx->queue;
+
+ if (!entry->show)
+ return -EIO;
+
+ res = -ENOENT;
+ mutex_lock(&q->sysfs_lock);
+ if (!blk_queue_dying(q))
+ res = entry->show(ctx, page);
+ mutex_unlock(&q->sysfs_lock);
+ return res;
+}
+
+static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
+ const char *page, size_t length)
+{
+ struct blk_mq_ctx_sysfs_entry *entry;
+ struct blk_mq_ctx *ctx;
+ struct request_queue *q;
+ ssize_t res;
+
+ entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
+ ctx = container_of(kobj, struct blk_mq_ctx, kobj);
+ q = ctx->queue;
+
+ if (!entry->store)
+ return -EIO;
+
+ res = -ENOENT;
+ mutex_lock(&q->sysfs_lock);
+ if (!blk_queue_dying(q))
+ res = entry->store(ctx, page, length);
+ mutex_unlock(&q->sysfs_lock);
+ return res;
+}
+
+static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
+ struct attribute *attr, char *page)
+{
+ struct blk_mq_hw_ctx_sysfs_entry *entry;
+ struct blk_mq_hw_ctx *hctx;
+ struct request_queue *q;
+ ssize_t res;
+
+ entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
+ hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
+ q = hctx->queue;
+
+ if (!entry->show)
+ return -EIO;
+
+ res = -ENOENT;
+ mutex_lock(&q->sysfs_lock);
+ if (!blk_queue_dying(q))
+ res = entry->show(hctx, page);
+ mutex_unlock(&q->sysfs_lock);
+ return res;
+}
+
+static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
+ struct attribute *attr, const char *page,
+ size_t length)
+{
+ struct blk_mq_hw_ctx_sysfs_entry *entry;
+ struct blk_mq_hw_ctx *hctx;
+ struct request_queue *q;
+ ssize_t res;
+
+ entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
+ hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
+ q = hctx->queue;
+
+ if (!entry->store)
+ return -EIO;
+
+ res = -ENOENT;
+ mutex_lock(&q->sysfs_lock);
+ if (!blk_queue_dying(q))
+ res = entry->store(hctx, page, length);
+ mutex_unlock(&q->sysfs_lock);
+ return res;
+}
+
+static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
+{
+ return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
+ ctx->rq_dispatched[0]);
+}
+
+static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
+{
+ return sprintf(page, "%lu\n", ctx->rq_merged);
+}
+
+static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
+{
+ return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
+ ctx->rq_completed[0]);
+}
+
+static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
+{
+ char *start_page = page;
+ struct request *rq;
+
+ page += sprintf(page, "%s:\n", msg);
+
+ list_for_each_entry(rq, list, queuelist)
+ page += sprintf(page, "\t%p\n", rq);
+
+ return page - start_page;
+}
+
+static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
+{
+ ssize_t ret;
+
+ spin_lock(&ctx->lock);
+ ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
+ spin_unlock(&ctx->lock);
+
+ return ret;
+}
+
+static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
+ char *page)
+{
+ return sprintf(page, "%lu\n", hctx->queued);
+}
+
+static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ return sprintf(page, "%lu\n", hctx->run);
+}
+
+static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
+ char *page)
+{
+ char *start_page = page;
+ int i;
+
+ page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
+
+ for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
+ unsigned long d = 1U << (i - 1);
+
+ page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
+ }
+
+ return page - start_page;
+}
+
+static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
+ char *page)
+{
+ ssize_t ret;
+
+ spin_lock(&hctx->lock);
+ ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
+ spin_unlock(&hctx->lock);
+
+ return ret;
+}
+
+static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ ssize_t ret;
+
+ spin_lock(&hctx->lock);
+ ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
+ spin_unlock(&hctx->lock);
+
+ return ret;
+}
+
+static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
+ const char *page, size_t len)
+{
+ struct blk_mq_ctx *ctx;
+ unsigned long ret;
+ unsigned int i;
+
+ if (kstrtoul(page, 10, &ret)) {
+ pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
+ return -EINVAL;
+ }
+
+ spin_lock(&hctx->lock);
+ if (ret)
+ hctx->flags |= BLK_MQ_F_SHOULD_IPI;
+ else
+ hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
+ spin_unlock(&hctx->lock);
+
+ hctx_for_each_ctx(hctx, ctx, i)
+ ctx->ipi_redirect = !!ret;
+
+ return len;
+}
+
+static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ return blk_mq_tag_sysfs_show(hctx->tags, page);
+}
+
+static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
+ .attr = {.name = "dispatched", .mode = S_IRUGO },
+ .show = blk_mq_sysfs_dispatched_show,
+};
+static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
+ .attr = {.name = "merged", .mode = S_IRUGO },
+ .show = blk_mq_sysfs_merged_show,
+};
+static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
+ .attr = {.name = "completed", .mode = S_IRUGO },
+ .show = blk_mq_sysfs_completed_show,
+};
+static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
+ .attr = {.name = "rq_list", .mode = S_IRUGO },
+ .show = blk_mq_sysfs_rq_list_show,
+};
+
+static struct attribute *default_ctx_attrs[] = {
+ &blk_mq_sysfs_dispatched.attr,
+ &blk_mq_sysfs_merged.attr,
+ &blk_mq_sysfs_completed.attr,
+ &blk_mq_sysfs_rq_list.attr,
+ NULL,
+};
+
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
+ .attr = {.name = "queued", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_queued_show,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
+ .attr = {.name = "run", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_run_show,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
+ .attr = {.name = "dispatched", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_dispatched_show,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
+ .attr = {.name = "pending", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_rq_list_show,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
+ .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
+ .show = blk_mq_hw_sysfs_ipi_show,
+ .store = blk_mq_hw_sysfs_ipi_store,
+};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
+ .attr = {.name = "tags", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_tags_show,
+};
+
+static struct attribute *default_hw_ctx_attrs[] = {
+ &blk_mq_hw_sysfs_queued.attr,
+ &blk_mq_hw_sysfs_run.attr,
+ &blk_mq_hw_sysfs_dispatched.attr,
+ &blk_mq_hw_sysfs_pending.attr,
+ &blk_mq_hw_sysfs_ipi.attr,
+ &blk_mq_hw_sysfs_tags.attr,
+ NULL,
+};
+
+static const struct sysfs_ops blk_mq_sysfs_ops = {
+ .show = blk_mq_sysfs_show,
+ .store = blk_mq_sysfs_store,
+};
+
+static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
+ .show = blk_mq_hw_sysfs_show,
+ .store = blk_mq_hw_sysfs_store,
+};
+
+static struct kobj_type blk_mq_ktype = {
+ .sysfs_ops = &blk_mq_sysfs_ops,
+ .release = blk_mq_sysfs_release,
+};
+
+static struct kobj_type blk_mq_ctx_ktype = {
+ .sysfs_ops = &blk_mq_sysfs_ops,
+ .default_attrs = default_ctx_attrs,
+ .release = blk_mq_sysfs_release,
+};
+
+static struct kobj_type blk_mq_hw_ktype = {
+ .sysfs_ops = &blk_mq_hw_sysfs_ops,
+ .default_attrs = default_hw_ctx_attrs,
+ .release = blk_mq_sysfs_release,
+};
+
+void blk_mq_unregister_disk(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+
+ kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
+ kobject_del(&q->mq_kobj);
+
+ kobject_put(&disk_to_dev(disk)->kobj);
+}
+
+int blk_mq_register_disk(struct gendisk *disk)
+{
+ struct device *dev = disk_to_dev(disk);
+ struct request_queue *q = disk->queue;
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ int ret, i, j;
+
+ kobject_init(&q->mq_kobj, &blk_mq_ktype);
+
+ ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
+ if (ret < 0)
+ return ret;
+
+ kobject_uevent(&q->mq_kobj, KOBJ_ADD);
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+ ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i);
+ if (ret)
+ break;
+
+ if (!hctx->nr_ctx)
+ continue;
+
+ hctx_for_each_ctx(hctx, ctx, j) {
+ kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
+ ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
+ if (ret)
+ break;
+ }
+ }
+
+ if (ret) {
+ blk_mq_unregister_disk(disk);
+ return ret;
+ }
+
+ return 0;
+}
View
204 block/blk-mq-tag.c
@@ -0,0 +1,204 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu_ida.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+/*
+ * Per tagged queue (tag address space) map
+ */
+struct blk_mq_tags {
+ unsigned int nr_tags;
+ unsigned int nr_reserved_tags;
+ unsigned int nr_batch_move;
+ unsigned int nr_max_cache;
+
+ struct percpu_ida free_tags;
+ struct percpu_ida reserved_tags;
+};
+
+void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
+{
+ int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
+ blk_mq_put_tag(tags, tag);
+}
+
+bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
+{
+ return !tags ||
+ percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
+}
+
+static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
+{
+ int tag;
+
+ tag = percpu_ida_alloc(&tags->free_tags, gfp);
+ if (tag < 0)
+ return BLK_MQ_TAG_FAIL;
+ return tag + tags->nr_reserved_tags;
+}
+
+static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
+ gfp_t gfp)
+{
+ int tag;
+
+ if (unlikely(!tags->nr_reserved_tags)) {
+ WARN_ON_ONCE(1);
+ return BLK_MQ_TAG_FAIL;
+ }
+
+ tag = percpu_ida_alloc(&tags->reserved_tags, gfp);
+ if (tag < 0)
+ return BLK_MQ_TAG_FAIL;
+ return tag;
+}
+
+unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved)
+{
+ if (!reserved)
+ return __blk_mq_get_tag(tags, gfp);
+
+ return __blk_mq_get_reserved_tag(tags, gfp);
+}
+
+static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
+{
+ BUG_ON(tag >= tags->nr_tags);
+
+ percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
+}
+
+static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
+ unsigned int tag)
+{
+ BUG_ON(tag >= tags->nr_reserved_tags);
+
+ percpu_ida_free(&tags->reserved_tags, tag);
+}
+
+void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
+{
+ if (tag >= tags->nr_reserved_tags)
+ __blk_mq_put_tag(tags, tag);
+ else
+ __blk_mq_put_reserved_tag(tags, tag);
+}
+
+static int __blk_mq_tag_iter(unsigned id, void *data)
+{
+ unsigned long *tag_map = data;
+ __set_bit(id, tag_map);
+ return 0;
+}
+
+void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
+ void (*fn)(void *, unsigned long *), void *data)
+{
+ unsigned long *tag_map;
+ size_t map_size;
+
+ map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
+ tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC);
+ if (!tag_map)
+ return;
+
+ percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
+ if (tags->nr_reserved_tags)
+ percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
+ tag_map);
+
+ fn(data, tag_map);
+ kfree(tag_map);
+}
+
+struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
+ unsigned int reserved_tags, int node)
+{
+ unsigned int nr_tags, nr_cache;
+ struct blk_mq_tags *tags;
+ int ret;
+
+ if (total_tags > BLK_MQ_TAG_MAX) {
+ pr_err("blk-mq: tag depth too large\n");
+ return NULL;
+ }
+
+ tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
+ if (!tags)
+ return NULL;
+
+ nr_tags = total_tags - reserved_tags;
+ nr_cache = nr_tags / num_possible_cpus();
+
+ if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
+ nr_cache = BLK_MQ_TAG_CACHE_MIN;
+ else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
+ nr_cache = BLK_MQ_TAG_CACHE_MAX;
+
+ tags->nr_tags = total_tags;
+ tags->nr_reserved_tags = reserved_tags;
+ tags->nr_max_cache = nr_cache;
+ tags->nr_batch_move = max(1u, nr_cache / 2);
+
+ ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
+ tags->nr_reserved_tags,
+ tags->nr_max_cache,
+ tags->nr_batch_move);
+ if (ret)
+ goto err_free_tags;
+
+ if (reserved_tags) {
+ /*
+ * With max_cahe and batch set to 1, the allocator fallbacks to
+ * no cached. It's fine reserved tags allocation is slow.
+ */
+ ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
+ 1, 1);
+ if (ret)
+ goto err_reserved_tags;
+ }
+
+ return tags;
+
+err_reserved_tags:
+ percpu_ida_destroy(&tags->free_tags);
+err_free_tags:
+ kfree(tags);
+ return NULL;
+}
+
+void blk_mq_free_tags(struct blk_mq_tags *tags)
+{
+ percpu_ida_destroy(&tags->free_tags);
+ percpu_ida_destroy(&tags->reserved_tags);
+ kfree(tags);
+}
+
+ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
+{
+ char *orig_page = page;
+ int cpu;
+
+ if (!tags)
+ return 0;
+
+ page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u,"
+ " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
+ tags->nr_batch_move, tags->nr_max_cache);
+
+ page += sprintf(page, "nr_free=%u, nr_reserved=%u\n",
+ percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
+ percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
+
+ for_each_possible_cpu(cpu) {
+ page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu,
+ percpu_ida_free_tags(&tags->free_tags, cpu));
+ }
+
+ return page - orig_page;
+}
View
27 block/blk-mq-tag.h
@@ -0,0 +1,27 @@
+#ifndef INT_BLK_MQ_TAG_H
+#define INT_BLK_MQ_TAG_H
+
+struct blk_mq_tags;
+
+extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
+extern void blk_mq_free_tags(struct blk_mq_tags *tags);
+
+extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved);
+extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags);
+extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
+extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
+extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
+extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
+
+enum {
+ BLK_MQ_TAG_CACHE_MIN = 1,
+ BLK_MQ_TAG_CACHE_MAX = 64,
+};
+
+enum {
+ BLK_MQ_TAG_FAIL = -1U,
+ BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN,
+ BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
+};
+
+#endif
View
1,500 block/blk-mq.c
@@ -0,0 +1,1500 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/smp.h>
+#include <linux/llist.h>
+#include <linux/list_sort.h>
+#include <linux/cpu.h>
+#include <linux/cache.h>
+#include <linux/sched/sysctl.h>
+#include <linux/delay.h>
+
+#include <trace/events/block.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+static DEFINE_MUTEX(all_q_mutex);
+static LIST_HEAD(all_q_list);
+
+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
+
+DEFINE_PER_CPU(struct llist_head, ipi_lists);
+
+static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
+ unsigned int cpu)
+{
+ return per_cpu_ptr(q->queue_ctx, cpu);
+}
+
+/*
+ * This assumes per-cpu software queueing queues. They could be per-node
+ * as well, for instance. For now this is hardcoded as-is. Note that we don't
+ * care about preemption, since we know the ctx's are persistent. This does
+ * mean that we can't rely on ctx always matching the currently running CPU.
+ */
+static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
+{
+ return __blk_mq_get_ctx(q, get_cpu());
+}
+
+static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
+{
+ put_cpu();
+}
+
+/*
+ * Check if any of the ctx's have pending work in this hardware queue
+ */
+static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
+{
+ unsigned int i;
+
+ for (i = 0; i < hctx->nr_ctx_map; i++)
+ if (hctx->ctx_map[i])
+ return true;
+
+ return false;
+}
+
+/*
+ * Mark this ctx as having pending work in this hardware queue
+ */
+static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+{
+ if (!test_bit(ctx->index_hw, hctx->ctx_map))
+ set_bit(ctx->index_hw, hctx->ctx_map);
+}
+
+static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
+ bool reserved)
+{
+ struct request *rq;
+ unsigned int tag;
+
+ tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
+ if (tag != BLK_MQ_TAG_FAIL) {
+ rq = hctx->rqs[tag];
+ rq->tag = tag;
+
+ return rq;
+ }
+
+ return NULL;
+}
+
+static int blk_mq_queue_enter(struct request_queue *q)
+{
+ int ret;
+
+ __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
+ smp_wmb();
+ /* we have problems to freeze the queue if it's initializing */
+ if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
+ return 0;
+
+ __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+
+ spin_lock_irq(q->queue_lock);
+ ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
+ !blk_queue_bypass(q), *q->queue_lock);
+ /* inc usage with lock hold to avoid freeze_queue runs here */
+ if (!ret)
+ __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
+ spin_unlock_irq(q->queue_lock);
+
+ return ret;
+}
+
+static void blk_mq_queue_exit(struct request_queue *q)
+{
+ __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+}
+
+/*
+ * Guarantee no request is in use, so we can change any data structure of
+ * the queue afterward.
+ */
+static void blk_mq_freeze_queue(struct request_queue *q)
+{
+ bool drain;
+
+ spin_lock_irq(q->queue_lock);
+ drain = !q->bypass_depth++;
+ queue_flag_set(QUEUE_FLAG_BYPASS, q);
+ spin_unlock_irq(q->queue_lock);
+
+ if (!drain)
+ return;
+
+ while (true) {
+ s64 count;
+
+ spin_lock_irq(q->queue_lock);
+ count = percpu_counter_sum(&q->mq_usage_counter);
+ spin_unlock_irq(q->queue_lock);
+
+ if (count == 0)
+ break;
+ blk_mq_run_queues(q, false);
+ msleep(10);
+ }
+}
+
+static void blk_mq_unfreeze_queue(struct request_queue *q)
+{
+ bool wake = false;
+
+ spin_lock_irq(q->queue_lock);
+ if (!--q->bypass_depth) {
+ queue_flag_clear(QUEUE_FLAG_BYPASS, q);
+ wake = true;
+ }
+ WARN_ON_ONCE(q->bypass_depth < 0);
+ spin_unlock_irq(q->queue_lock);
+ if (wake)
+ wake_up_all(&q->mq_freeze_wq);
+}
+
+bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
+{
+ return blk_mq_has_free_tags(hctx->tags);
+}
+EXPORT_SYMBOL(blk_mq_can_queue);
+
+static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq,
+ unsigned int rw_flags)
+{
+ rq->mq_ctx = ctx;
+ rq->cmd_flags = rw_flags;
+ ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
+}
+
+static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
+ gfp_t gfp, bool reserved)
+{
+ return blk_mq_alloc_rq(hctx, gfp, reserved);
+}
+
+static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
+ int rw, gfp_t gfp,
+ bool reserved)
+{
+ struct request *rq;
+
+ do {
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+ struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+ rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
+ if (rq) {
+ blk_mq_rq_ctx_init(ctx, rq, rw);
+ break;
+ } else if (!(gfp & __GFP_WAIT))
+ break;
+
+ blk_mq_put_ctx(ctx);
+ __blk_mq_run_hw_queue(hctx);
+ blk_mq_wait_for_tags(hctx->tags);
+ } while (1);
+
+ return rq;
+}
+
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+ gfp_t gfp, bool reserved)
+{
+ struct request *rq;
+
+ if (blk_mq_queue_enter(q))
+ return NULL;
+
+ rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
+ blk_mq_put_ctx(rq->mq_ctx);
+ return rq;
+}
+
+struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
+ gfp_t gfp)
+{
+ struct request *rq;
+
+ if (blk_mq_queue_enter(q))
+ return NULL;
+
+ rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
+ blk_mq_put_ctx(rq->mq_ctx);
+ return rq;
+}
+EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
+
+/*
+ * Re-init and set pdu, if we have it
+ */
+static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ blk_rq_init(hctx->queue, rq);
+
+ if (hctx->cmd_size)
+ rq->special = blk_mq_rq_to_pdu(rq);
+}
+
+static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx, struct request *rq)
+{
+ const int tag = rq->tag;
+ struct request_queue *q = rq->q;
+
+ blk_mq_rq_init(hctx, rq);
+ blk_mq_put_tag(hctx->tags, tag);
+
+ blk_mq_queue_exit(q);
+}
+
+void blk_mq_free_request(struct request *rq)
+{
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx;
+ struct request_queue *q = rq->q;
+
+ ctx->rq_completed[rq_is_sync(rq)]++;
+
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+ __blk_mq_free_request(hctx, ctx, rq);
+}
+
+static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
+{
+ if (error)
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ error = -EIO;
+
+ if (unlikely(rq->cmd_flags & REQ_QUIET))
+ set_bit(BIO_QUIET, &bio->bi_flags);
+
+ /* don't actually finish bio if it's part of flush sequence */
+ if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
+ bio_endio(bio, error);
+}
+
+void blk_mq_complete_request(struct request *rq, int error)
+{
+ struct bio *bio = rq->bio;
+ unsigned int bytes = 0;
+
+ trace_block_rq_complete(rq->q, rq);
+
+ while (bio) {
+ struct bio *next = bio->bi_next;
+
+ bio->bi_next = NULL;
+ bytes += bio->bi_size;
+ blk_mq_bio_endio(rq, bio, error);
+ bio = next;
+ }
+
+ blk_account_io_completion(rq, bytes);
+
+ if (rq->end_io)
+ rq->end_io(rq, error);
+ else
+ blk_mq_free_request(rq);
+
+ blk_account_io_done(rq);
+}
+
+void __blk_mq_end_io(struct request *rq, int error)
+{
+ if (!blk_mark_rq_complete(rq))
+ blk_mq_complete_request(rq, error);
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+
+/*
+ * Called with interrupts disabled.
+ */
+static void ipi_end_io(void *data)
+{
+ struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
+ struct llist_node *entry, *next;
+ struct request *rq;
+
+ entry = llist_del_all(list);
+
+ while (entry) {
+ next = entry->next;
+ rq = llist_entry(entry, struct request, ll_list);
+ __blk_mq_end_io(rq, rq->errors);
+ entry = next;
+ }
+}
+
+static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
+ struct request *rq, const int error)
+{
+ struct call_single_data *data = &rq->csd;
+
+ rq->errors = error;
+ rq->ll_list.next = NULL;
+
+ /*
+ * If the list is non-empty, an existing IPI must already
+ * be "in flight". If that is the case, we need not schedule
+ * a new one.
+ */
+ if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
+ data->func = ipi_end_io;
+ data->flags = 0;
+ __smp_call_function_single(ctx->cpu, data, 0);
+ }
+
+ return true;
+}
+#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
+ struct request *rq, const int error)
+{
+ return false;
+}
+#endif
+
+/*
+ * End IO on this request on a multiqueue enabled driver. We'll either do
+ * it directly inline, or punt to a local IPI handler on the matching
+ * remote CPU.
+ */
+void blk_mq_end_io(struct request *rq, int error)
+{
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ int cpu;
+
+ if (!ctx->ipi_redirect)
+ return __blk_mq_end_io(rq, error);
+
+ cpu = get_cpu();
+
+ if (cpu == ctx->cpu || !cpu_online(ctx->cpu) ||
+ !ipi_remote_cpu(ctx, cpu, rq, error))
+ __blk_mq_end_io(rq, error);
+
+ put_cpu();
+}
+EXPORT_SYMBOL(blk_mq_end_io);
+
+static void blk_mq_start_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+
+ trace_block_rq_issue(q, rq);
+
+ /*
+ * Just mark start time and set the started bit. Due to memory
+ * ordering, we know we'll see the correct deadline as long as
+ * REQ_ATOMIC_STARTED is seen.
+ */
+ rq->deadline = jiffies + q->rq_timeout;
+ set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+}
+
+static void blk_mq_requeue_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+
+ trace_block_rq_requeue(q, rq);
+ clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+}
+
+struct blk_mq_timeout_data {
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long *next;
+ unsigned int *next_set;
+};
+
+static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
+{
+ struct blk_mq_timeout_data *data = __data;
+ struct blk_mq_hw_ctx *hctx = data->hctx;
+ unsigned int tag;
+
+ /* It may not be in flight yet (this is where
+ * the REQ_ATOMIC_STARTED flag comes in). The requests are
+ * statically allocated, so we know it's always safe to access the
+ * memory associated with a bit offset into ->rqs[].
+ */
+ tag = 0;
+ do {
+ struct request *rq;
+
+ tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
+ if (tag >= hctx->queue_depth)
+ break;
+
+ rq = hctx->rqs[tag++];
+
+ if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+ continue;
+
+ blk_rq_check_expired(rq, data->next, data->next_set);
+ } while (1);
+}
+
+static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
+ unsigned long *next,
+ unsigned int *next_set)
+{
+ struct blk_mq_timeout_data data = {
+ .hctx = hctx,
+ .next = next,
+ .next_set = next_set,
+ };
+
+ /*
+ * Ask the tagging code to iterate busy requests, so we can
+ * check them for timeout.
+ */
+ blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
+}
+
+static void blk_mq_rq_timer(unsigned long data)
+{
+ struct request_queue *q = (struct request_queue *) data;
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long next = 0;
+ int i, next_set = 0;
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
+
+ if (next_set)
+ mod_timer(&q->timeout, round_jiffies_up(next));
+}
+
+/*
+ * Reverse check our software queue for entries that we could potentially
+ * merge with. Currently includes a hand-wavy stop count of 8, to not spend
+ * too much time checking for merges.
+ */
+static bool blk_mq_attempt_merge(struct request_queue *q,
+ struct blk_mq_ctx *ctx, struct bio *bio)
+{
+ struct request *rq;
+ int checked = 8;
+
+ list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
+ int el_ret;
+
+ if (!checked--)
+ break;
+
+ if (!blk_rq_merge_ok(rq, bio))
+ continue;
+
+ el_ret = blk_try_merge(rq, bio);
+ if (el_ret == ELEVATOR_BACK_MERGE) {
+ if (bio_attempt_back_merge(q, rq, bio)) {
+ ctx->rq_merged++;
+ return true;
+ }
+ break;
+ } else if (el_ret == ELEVATOR_FRONT_MERGE) {
+ if (bio_attempt_front_merge(q, rq, bio)) {
+ ctx->rq_merged++;
+ return true;
+ }
+ break;
+ }
+ }
+
+ return false;
+}
+
+void blk_mq_add_timer(struct request *rq)
+{
+ __blk_add_timer(rq, NULL);
+}
+
+/*
+ * Run this hardware queue, pulling any software queues mapped to it in.
+ * Note that this function currently has various problems around ordering
+ * of IO. In particular, we'd like FIFO behaviour on handling existing
+ * items on the hctx->dispatch list. Ignore that for now.
+ */
+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+ struct blk_mq_ctx *ctx;
+ struct request *rq;
+ LIST_HEAD(rq_list);
+ int bit, queued;
+
+ if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))
+ return;
+
+ hctx->run++;
+
+ /*
+ * Touch any software queue that has pending entries.
+ */
+ for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
+ clear_bit(bit, hctx->ctx_map);
+ ctx = hctx->ctxs[bit];
+ BUG_ON(bit != ctx->index_hw);
+
+ spin_lock(&ctx->lock);
+ list_splice_tail_init(&ctx->rq_list, &rq_list);
+ spin_unlock(&ctx->lock);
+ }
+
+ /*
+ * If we have previous entries on our dispatch list, grab them
+ * and stuff them at the front for more fair dispatch.
+ */
+ if (!list_empty_careful(&hctx->dispatch)) {
+ spin_lock(&hctx->lock);
+ if (!list_empty(&hctx->dispatch))
+ list_splice_init(&hctx->dispatch, &rq_list);
+ spin_unlock(&hctx->lock);
+ }
+
+ /*
+ * Delete and return all entries from our dispatch list
+ */
+ queued = 0;
+
+ /*
+ * Now process all the entries, sending them to the driver.
+ */
+ while (!list_empty(&rq_list)) {
+ int ret;
+
+ rq = list_first_entry(&rq_list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ blk_mq_start_request(rq);
+
+ /*
+ * Last request in the series. Flag it as such, this
+ * enables drivers to know when IO should be kicked off,
+ * if they don't do it on a per-request basis.
+ *
+ * Note: the flag isn't the only condition drivers
+ * should do kick off. If drive is busy, the last
+ * request might not have the bit set.
+ */
+ if (list_empty(&rq_list))
+ rq->cmd_flags |= REQ_END;
+
+ ret = q->mq_ops->queue_rq(hctx, rq);
+ switch (ret) {
+ case BLK_MQ_RQ_QUEUE_OK:
+ queued++;
+ continue;
+ case BLK_MQ_RQ_QUEUE_BUSY:
+ /*
+ * FIXME: we should have a mechanism to stop the queue
+ * like blk_stop_queue, otherwise we will waste cpu
+ * time
+ */
+ list_add(&rq->queuelist, &rq_list);
+ blk_mq_requeue_request(rq);
+ break;
+ default:
+ pr_err("blk-mq: bad return on queue: %d\n", ret);
+ rq->errors = -EIO;
+ case BLK_MQ_RQ_QUEUE_ERROR:
+ blk_mq_end_io(rq, rq->errors);
+ break;
+ }
+
+ if (ret == BLK_MQ_RQ_QUEUE_BUSY)
+ break;
+ }
+
+ if (!queued)
+ hctx->dispatched[0]++;
+ else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
+ hctx->dispatched[ilog2(queued) + 1]++;
+
+ /*