* 以下で, 32 bit整数 128M個が隙間なく昇順に並んだファイル (512MB) を作る

In [1]:
#!/usr/bin/python3
import os
import sys
import getpass
import numpy as np

def mk_sorted_file(filename, size):
    """Write `size` ascending 32-bit unsigned integers to `filename` as raw binary.

    The values are the running sum of pseudo-random increments drawn from a
    generator seeded with 1234, so the output is reproducible for a given
    `size`.  Each increment lies in [0, 2**30 // size), which keeps every
    value below 2**30.

    Args:
        filename: output path; its parent directory is created if missing.
        size: number of integers to write (at most 128 * 1024 * 1024).

    Raises:
        AssertionError: if `size` exceeds 128 * 1024 * 1024.
    """
    assert(size <= 128 * 1024 * 1024), "don't make it > 512MB"
    rg = np.random.RandomState()
    rg.seed(1234)
    # Integer division: the exclusive upper bound handed to randint should be
    # an int, not the float that true division produces.
    gap = 1024 * 1024 * 1024 // size
    steps = rg.randint(0, gap, size=size, dtype=np.uint32)
    # Vectorized prefix sum instead of a per-element Python loop.  The dtype is
    # pinned to uint32 so the file holds 4-byte elements; the total stays
    # below 2**30, so the 32-bit accumulator cannot wrap around.
    a = np.cumsum(steps, dtype=np.uint32)
    dirname = os.path.dirname(filename) or "."
    os.makedirs(dirname, exist_ok=True)
    with open(filename, "wb") as wp:
        a.tofile(wp)

def main():
    """Command-line entry point: mk_sorted_file.py FILENAME SIZE.

    Takes the output path from the first command-line argument and the
    element count (parsed as an int) from the second, then generates the file.
    """
    out_path, count_arg = sys.argv[1], sys.argv[2]
    mk_sorted_file(out_path, int(count_arg))
        
# Call main() only when the running program's name (sys.argv[0]) ends with
# "mk_sorted_file.py", i.e. when this code is executed as that script;
# otherwise only the definitions above take effect.
if sys.argv[0].endswith("mk_sorted_file.py"):
    main()


* 多少(20秒くらい)かかるが気長に待つ

In [2]:
# 128 Mi elements; mk_sorted_file stores them as uint32 (4 bytes each), i.e. 512 MiB
n = 128 * 1024 * 1024
# Writes the sorted array to sorted.bin in the current directory
mk_sorted_file("sorted.bin", n)

KeyboardInterrupt: 

# <font color="green"> Problem 1 :  mmapの有効な利用</font>

In [None]:
%%writefile mmap_bsearch_ans.c

#include <assert.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* Three-way comparison of two ints, suitable for bsearch()/qsort().
   Returns a negative, zero, or positive value when *a_ is less than, equal
   to, or greater than *b_.  Comparisons are used instead of the subtraction
   *a - *b, which overflows (undefined behaviour, wrong sign) when the
   operands are far apart, e.g. INT_MAX and -1. */
int compare_int(const void * a_, const void * b_) {
  const int x = *(const int *)a_;
  const int y = *(const int *)b_;
  return (x > y) - (x < y);
}

/* Binary-search a file of sorted ints for a key, accessing the file via mmap.
   usage: mmap_bsearch_ans [filename [key]]   (defaults: "sorted.bin", 100)
   Prints "<key> found at <index>-th element" or "<key> not found".
   Exits with status 1 and a message if a system call fails. */
int main(int argc, char ** argv) {
  int i = 1;
  char * filename = (argc > i ? argv[i] : "sorted.bin"); i++;
  int key         = (argc > i ? atoi(argv[i]) : 100); i++;
  /* open the file */
  int fd = open(filename, O_RDONLY);
  if (fd == -1) err(1, "open");
  /* get the file size */
  struct stat sb[1];
  if (fstat(fd, sb) == -1) err(1, "fstat");
  long sz = sb->st_size;
  const long n = sz / sizeof(int);
  int * a = NULL;
  int * found = NULL;
  if (sz > 0) {
    /* mmap (read only); skipped for an empty file because a zero-length
       mapping is rejected, and an empty file cannot contain the key anyway */
    a = mmap(0, sz, PROT_READ, MAP_PRIVATE, fd, 0);
    if (a == MAP_FAILED) err(1, "mmap");
    found = bsearch(&key, a, n, sizeof(int), compare_int);
  }
  if (found) {
    /* pointer difference is a ptrdiff_t; convert explicitly to match %ld */
    printf("%d found at %ld-th element\n", key, (long)(found - a));
  } else {
    printf("%d not found\n", key);
  }
  if (sz > 0 && munmap(a, sz) == -1) err(1, "munmap");
  if (close(fd) == -1) err(1, "close");
  return 0;
}

In [None]:
# build mmap_bsearch_ans with warnings enabled; -O3 matches the flags used for
# the read_bsearch_ans and linear_search_ans builds it is timed against
gcc -o mmap_bsearch_ans -Wall -O3 mmap_bsearch_ans.c

In [None]:
# Check mmap_bsearch_ans against the expected answers for sorted.bin.
# Each entry is the exact output line expected from the program; its first
# word is the key to search for.  For every case this prints OK or NG
# (grep also echoes the matching line, and /usr/bin/time reports the timing).
data=sorted.bin
for expected in \
    "26127088 found at 7466042-th element" \
    "173640831 found at 49610854-th element" \
    "33502494 found at 9572459-th element" \
    "413005234 found at 118006295-th element" \
    "261784903 found at 74798397-th element" \
    "88660114 not found" \
    "347238073 not found" \
    "417070716 not found" \
    "389200408 not found" \
    "225789696 not found"
do
    key=${expected%% *}
    if /usr/bin/time ./mmap_bsearch_ans ${data} ${key} | grep "${expected}" ; then echo OK ; else echo NG ; fi
done

# <font color="green"> Problem 2 :  readとの比較</font>

In [None]:
%%writefile read_bsearch_ans.c

#include <assert.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

/* Three-way comparison of two ints, suitable for bsearch()/qsort().
   Returns a negative, zero, or positive value when *a_ is less than, equal
   to, or greater than *b_.  Comparisons are used instead of the subtraction
   *a - *b, which overflows (undefined behaviour, wrong sign) when the
   operands are far apart, e.g. INT_MAX and -1. */
int compare_int(const void * a_, const void * b_) {
  const int x = *(const int *)a_;
  const int y = *(const int *)b_;
  return (x > y) - (x < y);
}

/* Read exactly sz bytes from fd into buf.
   Loops because a single read() may return fewer bytes than requested.
   Exits with status 1 and a message if read() fails or if end-of-file is
   reached before sz bytes have been read. */
void read_sz(int fd, void * buf, size_t sz) {
  char * dst = buf;  /* byte pointer: arithmetic on void * is a compiler extension */
  size_t rd = 0;
  while (rd < sz) {
    ssize_t x = read(fd, dst + rd, sz - rd);
    if (x == -1) err(1, "read");
    /* x == 0 means EOF.  Report it explicitly rather than with assert(),
       which is compiled out under NDEBUG and would leave this loop spinning. */
    if (x == 0) errx(1, "read: unexpected end of file (%zu of %zu bytes read)", rd, sz);
    rd += x;
  }
  assert(rd == sz);
}

/* Binary-search a file of sorted ints for a key, after loading the whole
   file into a heap buffer with read().
   usage: read_bsearch_ans [filename [key]]   (defaults: "sorted.bin", 100)
   Prints "<key> found at <index>-th element" or "<key> not found".
   Exits with status 1 and a message if a system call or allocation fails. */
int main(int argc, char ** argv) {
  int i = 1;
  char * filename = (argc > i ? argv[i] : "sorted.bin"); i++;
  int key         = (argc > i ? atoi(argv[i]) : 100); i++;
  /* open the file */
  int fd = open(filename, O_RDONLY);
  if (fd == -1) err(1, "open");
  /* get the file size */
  struct stat sb[1];
  if (fstat(fd, sb) == -1) err(1, "fstat");
  long sz = sb->st_size;
  /* allocate a buffer for the whole file and fill it with read();
     at least one byte is requested because malloc(0) is allowed to return
     NULL, which would otherwise be reported as a failure for an empty file */
  int * a = malloc(sz > 0 ? sz : 1);
  if (!a) err(1, "malloc");
  read_sz(fd, a, sz);
  const long n = sz / sizeof(int);
  int * found = bsearch(&key, a, n, sizeof(int), compare_int);
  if (found) {
    /* pointer difference is a ptrdiff_t; convert explicitly to match %ld */
    printf("%d found at %ld-th element\n", key, (long)(found - a));
  } else {
    printf("%d not found\n", key);
  }
  free(a);
  if (close(fd) == -1) err(1, "close");
  return 0;
}

In [None]:
# build read_bsearch_ans with warnings enabled (-Wall) and -O3 optimization
gcc -o read_bsearch_ans -Wall -O3 read_bsearch_ans.c

In [None]:
# Check read_bsearch_ans against the expected answers for sorted.bin.
# Each entry is the exact output line expected from the program; its first
# word is the key to search for.  For every case this prints OK or NG
# (grep also echoes the matching line, and /usr/bin/time reports the timing).
data=sorted.bin
for expected in \
    "26127088 found at 7466042-th element" \
    "173640831 found at 49610854-th element" \
    "33502494 found at 9572459-th element" \
    "413005234 found at 118006295-th element" \
    "261784903 found at 74798397-th element" \
    "88660114 not found" \
    "347238073 not found" \
    "417070716 not found" \
    "389200408 not found" \
    "225789696 not found"
do
    key=${expected%% *}
    if /usr/bin/time ./read_bsearch_ans ${data} ${key} | grep "${expected}" ; then echo OK ; else echo NG ; fi
done

In [None]:
%%writefile linear_search_ans.c

#include <assert.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* Three-way comparison of two ints, suitable for bsearch()/qsort().
   Returns a negative, zero, or positive value when *a_ is less than, equal
   to, or greater than *b_.  Comparisons are used instead of the subtraction
   *a - *b, which overflows (undefined behaviour, wrong sign) when the
   operands are far apart, e.g. INT_MAX and -1. */
int compare_int(const void * a_, const void * b_) {
  const int x = *(const int *)a_;
  const int y = *(const int *)b_;
  return (x > y) - (x < y);
}

/* Read exactly sz bytes from fd into buf.
   Loops because a single read() may return fewer bytes than requested.
   Exits with status 1 and a message if read() fails or if end-of-file is
   reached before sz bytes have been read. */
void read_sz(int fd, void * buf, size_t sz) {
  char * dst = buf;  /* byte pointer: arithmetic on void * is a compiler extension */
  size_t rd = 0;
  while (rd < sz) {
    ssize_t x = read(fd, dst + rd, sz - rd);
    if (x == -1) err(1, "read");
    /* x == 0 means EOF.  Report it explicitly rather than with assert(),
       which is compiled out under NDEBUG and would leave this loop spinning. */
    if (x == 0) errx(1, "read: unexpected end of file (%zu of %zu bytes read)", rd, sz);
    rd += x;
  }
  assert(rd == sz);
}

/* Linear search of a file of sorted ints, reading it in fixed-size chunks.
   usage: linear_search_ans [filename [key]]   (defaults: "sorted.bin", 100)
   Prints "<key> found at <index>-th element" or "<key> not found".
   The scan stops at the first element equal to or greater than the key. */
int main(int argc, char ** argv) {
  const char * path = (argc > 1 ? argv[1] : "sorted.bin");
  const int key     = (argc > 2 ? atoi(argv[2]) : 100);
  /* open the file */
  int fd = open(path, O_RDONLY);
  if (fd == -1) err(1, "open");
  /* get the file size and derive the element count */
  struct stat st;
  if (fstat(fd, &st) == -1) err(1, "fstat");
  long sz = st.st_size;
  const long n_elems = sz / sizeof(int);
  /* scan the file one chunk of CHUNK ints at a time */
  enum { CHUNK = 1024 };
  int chunk[CHUNK];
  long hit = -1;      /* index of the key once found, -1 until then */
  int past_key = 0;   /* set when an element larger than the key is seen */
  long base = 0;      /* index of the first element of the current chunk */
  while (base < n_elems && hit < 0 && !past_key) {
    long remaining = n_elems - base;
    long m = (remaining < CHUNK ? remaining : CHUNK);
    read_sz(fd, chunk, m * sizeof(int));
    for (long j = 0; j < m; j++) {
      if (chunk[j] == key) {
        hit = base + j;
        break;
      }
      if (chunk[j] > key) {
        /* the data is sorted, so the key cannot appear later */
        past_key = 1;
        break;
      }
    }
    base += CHUNK;
  }
  if (hit >= 0) {
    printf("%d found at %ld-th element\n", key, hit);
  } else {
    printf("%d not found\n", key);
  }
  if (close(fd) == -1) err(1, "close");
  return 0;
}

In [None]:
# build linear_search_ans with warnings enabled (-Wall) and -O3 optimization
gcc -o linear_search_ans -Wall -O3 linear_search_ans.c

In [None]:
# Check linear_search_ans against the expected answers for sorted.bin.
# Each entry is the exact output line expected from the program; its first
# word is the key to search for.  For every case this prints OK or NG
# (grep also echoes the matching line, and /usr/bin/time reports the timing).
data=sorted.bin
for expected in \
    "26127088 found at 7466042-th element" \
    "173640831 found at 49610854-th element" \
    "33502494 found at 9572459-th element" \
    "413005234 found at 118006295-th element" \
    "261784903 found at 74798397-th element" \
    "88660114 not found" \
    "347238073 not found" \
    "417070716 not found" \
    "389200408 not found" \
    "225789696 not found"
do
    key=${expected%% *}
    if /usr/bin/time ./linear_search_ans ${data} ${key} | grep "${expected}" ; then echo OK ; else echo NG ; fi
done