Skip to content

Commit 9048162

Browse files
dgibson authored and torvalds committed
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the general quota handling code, and they don't much resemble its behaviour. Rather than being about maintaining limits on on-disk block usage by particular users, they are instead about maintaining limits on in-memory page usage (including anonymous MAP_PRIVATE copied-on-write pages) associated with a particular hugetlbfs filesystem instance. Worse, they work by having callbacks to the hugetlbfs filesystem code from the low-level page handling code, in particular from free_huge_page(). This is a layering violation of itself, but more importantly, if the kernel does a get_user_pages() on hugepages (which can happen from KVM amongst others), then the free_huge_page() can be delayed until after the associated inode has already been freed. If an unmount occurs at the wrong time, even the hugetlbfs superblock where the "quota" limits are stored may have been freed. Andrew Barry proposed a patch to fix this by having hugepages, instead of storing a pointer to their address_space and reaching the superblock from there, store pointers directly to the superblock, bumping the reference count as appropriate to avoid it being freed. Andrew Morton rejected that version, however, on the grounds that it made the existing layering violation worse. This is a reworked version of Andrew's patch, which removes the extra, and some of the existing, layering violation. It works by introducing the concept of a hugepage "subpool" at the lower hugepage mm layer - that is a finite logical pool of hugepages to allocate from. hugetlbfs now creates a subpool for each filesystem instance with a page limit set, and a pointer to the subpool gets added to each allocated hugepage, instead of the address_space pointer used now. The subpool has its own lifetime and is only freed once all pages in it _and_ all other references to it (i.e. superblocks) are gone. 
subpools are optional - a NULL subpool pointer is taken by the code to mean that no subpool limits are in effect. Previous discussion of this bug found in: "Fix refcounting in hugetlbfs quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or http://marc.info/?l=linux-mm&m=126928970510627&w=1 v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to alloc_huge_page() - since it already takes the vma, it is not necessary. Signed-off-by: Andrew Barry <abarry@cray.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Cc: Hugh Dickins <hughd@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Hillf Danton <dhillf@gmail.com> Cc: Paul Mackerras <paulus@samba.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent a1d776e commit 9048162

File tree

3 files changed

+139
-64
lines changed

3 files changed

+139
-64
lines changed

Diff for: fs/hugetlbfs/inode.c

+21-33
Original file line numberDiff line numberDiff line change
@@ -626,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
626626
spin_lock(&sbinfo->stat_lock);
627627
/* If no limits set, just report 0 for max/free/used
628628
* blocks, like simple_statfs() */
629-
if (sbinfo->max_blocks >= 0) {
630-
buf->f_blocks = sbinfo->max_blocks;
631-
buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
629+
if (sbinfo->spool) {
630+
long free_pages;
631+
632+
spin_lock(&sbinfo->spool->lock);
633+
buf->f_blocks = sbinfo->spool->max_hpages;
634+
free_pages = sbinfo->spool->max_hpages
635+
- sbinfo->spool->used_hpages;
636+
buf->f_bavail = buf->f_bfree = free_pages;
637+
spin_unlock(&sbinfo->spool->lock);
632638
buf->f_files = sbinfo->max_inodes;
633639
buf->f_ffree = sbinfo->free_inodes;
634640
}
@@ -644,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
644650

645651
if (sbi) {
646652
sb->s_fs_info = NULL;
653+
654+
if (sbi->spool)
655+
hugepage_put_subpool(sbi->spool);
656+
647657
kfree(sbi);
648658
}
649659
}
@@ -874,10 +884,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
874884
sb->s_fs_info = sbinfo;
875885
sbinfo->hstate = config.hstate;
876886
spin_lock_init(&sbinfo->stat_lock);
877-
sbinfo->max_blocks = config.nr_blocks;
878-
sbinfo->free_blocks = config.nr_blocks;
879887
sbinfo->max_inodes = config.nr_inodes;
880888
sbinfo->free_inodes = config.nr_inodes;
889+
sbinfo->spool = NULL;
890+
if (config.nr_blocks != -1) {
891+
sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
892+
if (!sbinfo->spool)
893+
goto out_free;
894+
}
881895
sb->s_maxbytes = MAX_LFS_FILESIZE;
882896
sb->s_blocksize = huge_page_size(config.hstate);
883897
sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -896,38 +910,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
896910
sb->s_root = root;
897911
return 0;
898912
out_free:
913+
if (sbinfo->spool)
914+
kfree(sbinfo->spool);
899915
kfree(sbinfo);
900916
return -ENOMEM;
901917
}
902918

903-
int hugetlb_get_quota(struct address_space *mapping, long delta)
904-
{
905-
int ret = 0;
906-
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
907-
908-
if (sbinfo->free_blocks > -1) {
909-
spin_lock(&sbinfo->stat_lock);
910-
if (sbinfo->free_blocks - delta >= 0)
911-
sbinfo->free_blocks -= delta;
912-
else
913-
ret = -ENOMEM;
914-
spin_unlock(&sbinfo->stat_lock);
915-
}
916-
917-
return ret;
918-
}
919-
920-
void hugetlb_put_quota(struct address_space *mapping, long delta)
921-
{
922-
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
923-
924-
if (sbinfo->free_blocks > -1) {
925-
spin_lock(&sbinfo->stat_lock);
926-
sbinfo->free_blocks += delta;
927-
spin_unlock(&sbinfo->stat_lock);
928-
}
929-
}
930-
931919
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
932920
int flags, const char *dev_name, void *data)
933921
{

Diff for: include/linux/hugetlb.h

+10-4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,15 @@ struct user_struct;
1414
#include <linux/shm.h>
1515
#include <asm/tlbflush.h>
1616

17+
struct hugepage_subpool {
18+
spinlock_t lock;
19+
long count;
20+
long max_hpages, used_hpages;
21+
};
22+
23+
struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
24+
void hugepage_put_subpool(struct hugepage_subpool *spool);
25+
1726
int PageHuge(struct page *page);
1827

1928
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
@@ -129,12 +138,11 @@ enum {
129138

130139
#ifdef CONFIG_HUGETLBFS
131140
struct hugetlbfs_sb_info {
132-
long max_blocks; /* blocks allowed */
133-
long free_blocks; /* blocks free */
134141
long max_inodes; /* inodes allowed */
135142
long free_inodes; /* inodes free */
136143
spinlock_t stat_lock;
137144
struct hstate *hstate;
145+
struct hugepage_subpool *spool;
138146
};
139147

140148
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
@@ -146,8 +154,6 @@ extern const struct file_operations hugetlbfs_file_operations;
146154
extern const struct vm_operations_struct hugetlb_vm_ops;
147155
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
148156
struct user_struct **user, int creat_flags);
149-
int hugetlb_get_quota(struct address_space *mapping, long delta);
150-
void hugetlb_put_quota(struct address_space *mapping, long delta);
151157

152158
static inline int is_file_hugepages(struct file *file)
153159
{

Diff for: mm/hugetlb.c

+108-27
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
5353
*/
5454
static DEFINE_SPINLOCK(hugetlb_lock);
5555

56+
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57+
{
58+
bool free = (spool->count == 0) && (spool->used_hpages == 0);
59+
60+
spin_unlock(&spool->lock);
61+
62+
/* If no pages are used, and no other handles to the subpool
63+
* remain, free the subpool */
64+
if (free)
65+
kfree(spool);
66+
}
67+
68+
struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
69+
{
70+
struct hugepage_subpool *spool;
71+
72+
spool = kmalloc(sizeof(*spool), GFP_KERNEL);
73+
if (!spool)
74+
return NULL;
75+
76+
spin_lock_init(&spool->lock);
77+
spool->count = 1;
78+
spool->max_hpages = nr_blocks;
79+
spool->used_hpages = 0;
80+
81+
return spool;
82+
}
83+
84+
void hugepage_put_subpool(struct hugepage_subpool *spool)
85+
{
86+
spin_lock(&spool->lock);
87+
BUG_ON(!spool->count);
88+
spool->count--;
89+
unlock_or_release_subpool(spool);
90+
}
91+
92+
static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
93+
long delta)
94+
{
95+
int ret = 0;
96+
97+
if (!spool)
98+
return 0;
99+
100+
spin_lock(&spool->lock);
101+
if ((spool->used_hpages + delta) <= spool->max_hpages) {
102+
spool->used_hpages += delta;
103+
} else {
104+
ret = -ENOMEM;
105+
}
106+
spin_unlock(&spool->lock);
107+
108+
return ret;
109+
}
110+
111+
static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
112+
long delta)
113+
{
114+
if (!spool)
115+
return;
116+
117+
spin_lock(&spool->lock);
118+
spool->used_hpages -= delta;
119+
/* If hugetlbfs_put_super couldn't free spool due to
120+
* an outstanding quota reference, free it now. */
121+
unlock_or_release_subpool(spool);
122+
}
123+
124+
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
125+
{
126+
return HUGETLBFS_SB(inode->i_sb)->spool;
127+
}
128+
129+
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
130+
{
131+
return subpool_inode(vma->vm_file->f_dentry->d_inode);
132+
}
133+
56134
/*
57135
* Region tracking -- allows tracking of reservations and instantiated pages
58136
* across the pages in a mapping.
@@ -540,9 +618,9 @@ static void free_huge_page(struct page *page)
540618
*/
541619
struct hstate *h = page_hstate(page);
542620
int nid = page_to_nid(page);
543-
struct address_space *mapping;
621+
struct hugepage_subpool *spool =
622+
(struct hugepage_subpool *)page_private(page);
544623

545-
mapping = (struct address_space *) page_private(page);
546624
set_page_private(page, 0);
547625
page->mapping = NULL;
548626
BUG_ON(page_count(page));
@@ -558,8 +636,7 @@ static void free_huge_page(struct page *page)
558636
enqueue_huge_page(h, page);
559637
}
560638
spin_unlock(&hugetlb_lock);
561-
if (mapping)
562-
hugetlb_put_quota(mapping, 1);
639+
hugepage_subpool_put_pages(spool, 1);
563640
}
564641

565642
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -977,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
9771054
/*
9781055
* Determine if the huge page at addr within the vma has an associated
9791056
* reservation. Where it does not we will need to logically increase
980-
* reservation and actually increase quota before an allocation can occur.
981-
* Where any new reservation would be required the reservation change is
982-
* prepared, but not committed. Once the page has been quota'd allocated
983-
* an instantiated the change should be committed via vma_commit_reservation.
984-
* No action is required on failure.
1057+
* reservation and actually increase subpool usage before an allocation
1058+
* can occur. Where any new reservation would be required the
1059+
* reservation change is prepared, but not committed. Once the page
1060+
* has been allocated from the subpool and instantiated the change should
1061+
* be committed via vma_commit_reservation. No action is required on
1062+
* failure.
9851063
*/
9861064
static long vma_needs_reservation(struct hstate *h,
9871065
struct vm_area_struct *vma, unsigned long addr)
@@ -1030,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
10301108
static struct page *alloc_huge_page(struct vm_area_struct *vma,
10311109
unsigned long addr, int avoid_reserve)
10321110
{
1111+
struct hugepage_subpool *spool = subpool_vma(vma);
10331112
struct hstate *h = hstate_vma(vma);
10341113
struct page *page;
1035-
struct address_space *mapping = vma->vm_file->f_mapping;
1036-
struct inode *inode = mapping->host;
10371114
long chg;
10381115

10391116
/*
1040-
* Processes that did not create the mapping will have no reserves and
1041-
* will not have accounted against quota. Check that the quota can be
1042-
* made before satisfying the allocation
1043-
* MAP_NORESERVE mappings may also need pages and quota allocated
1044-
* if no reserve mapping overlaps.
1117+
* Processes that did not create the mapping will have no
1118+
* reserves and will not have accounted against subpool
1119+
* limit. Check that the subpool limit can be made before
1120+
* satisfying the allocation. MAP_NORESERVE mappings may also
1121+
* need pages and subpool limit allocated if no reserve
1122+
* mapping overlaps.
10451123
*/
10461124
chg = vma_needs_reservation(h, vma, addr);
10471125
if (chg < 0)
10481126
return ERR_PTR(-VM_FAULT_OOM);
10491127
if (chg)
1050-
if (hugetlb_get_quota(inode->i_mapping, chg))
1128+
if (hugepage_subpool_get_pages(spool, chg))
10511129
return ERR_PTR(-VM_FAULT_SIGBUS);
10521130

10531131
spin_lock(&hugetlb_lock);
@@ -1057,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
10571135
if (!page) {
10581136
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
10591137
if (!page) {
1060-
hugetlb_put_quota(inode->i_mapping, chg);
1138+
hugepage_subpool_put_pages(spool, chg);
10611139
return ERR_PTR(-VM_FAULT_SIGBUS);
10621140
}
10631141
}
10641142

1065-
set_page_private(page, (unsigned long) mapping);
1143+
set_page_private(page, (unsigned long)spool);
10661144

10671145
vma_commit_reservation(h, vma, addr);
10681146

@@ -2083,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
20832161
{
20842162
struct hstate *h = hstate_vma(vma);
20852163
struct resv_map *reservations = vma_resv_map(vma);
2164+
struct hugepage_subpool *spool = subpool_vma(vma);
20862165
unsigned long reserve;
20872166
unsigned long start;
20882167
unsigned long end;
@@ -2098,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
20982177

20992178
if (reserve) {
21002179
hugetlb_acct_memory(h, -reserve);
2101-
hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
2180+
hugepage_subpool_put_pages(spool, reserve);
21022181
}
21032182
}
21042183
}
@@ -2331,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
23312410
*/
23322411
address = address & huge_page_mask(h);
23332412
pgoff = vma_hugecache_offset(h, vma, address);
2334-
mapping = (struct address_space *)page_private(page);
2413+
mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
23352414

23362415
/*
23372416
* Take the mapping lock for the duration of the table walk. As
@@ -2884,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
28842963
{
28852964
long ret, chg;
28862965
struct hstate *h = hstate_inode(inode);
2966+
struct hugepage_subpool *spool = subpool_inode(inode);
28872967

28882968
/*
28892969
* Only apply hugepage reservation if asked. At fault time, an
28902970
* attempt will be made for VM_NORESERVE to allocate a page
2891-
* and filesystem quota without using reserves
2971+
* without using reserves
28922972
*/
28932973
if (vm_flags & VM_NORESERVE)
28942974
return 0;
@@ -2915,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode,
29152995
if (chg < 0)
29162996
return chg;
29172997

2918-
/* There must be enough filesystem quota for the mapping */
2919-
if (hugetlb_get_quota(inode->i_mapping, chg))
2998+
/* There must be enough pages in the subpool for the mapping */
2999+
if (hugepage_subpool_get_pages(spool, chg))
29203000
return -ENOSPC;
29213001

29223002
/*
29233003
* Check enough hugepages are available for the reservation.
2924-
* Hand back the quota if there are not
3004+
* Hand the pages back to the subpool if there are not
29253005
*/
29263006
ret = hugetlb_acct_memory(h, chg);
29273007
if (ret < 0) {
2928-
hugetlb_put_quota(inode->i_mapping, chg);
3008+
hugepage_subpool_put_pages(spool, chg);
29293009
return ret;
29303010
}
29313011

@@ -2949,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
29493029
{
29503030
struct hstate *h = hstate_inode(inode);
29513031
long chg = region_truncate(&inode->i_mapping->private_list, offset);
3032+
struct hugepage_subpool *spool = subpool_inode(inode);
29523033

29533034
spin_lock(&inode->i_lock);
29543035
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
29553036
spin_unlock(&inode->i_lock);
29563037

2957-
hugetlb_put_quota(inode->i_mapping, (chg - freed));
3038+
hugepage_subpool_put_pages(spool, (chg - freed));
29583039
hugetlb_acct_memory(h, -(chg - freed));
29593040
}
29603041

0 commit comments

Comments
 (0)