## Today's exercises

1. Implement deletion (to submit next Monday)
2. Think about concurrency issues in our little database

### Doing it on disk

- in the hashmap(dict) in memory, store a file offset instead
- this file is an append only file.
- if you update, simply append a new entry and change the offset in the hashmap
- this is what bitcask in Riak does
- break the file into segments. Once a segment has been written, its kv pairs are never changed. Maintain a hashmap per segment and search these in order
- run compaction to throw away dupes from segments and merge them; delete old files
- deletion is done by writing a tombstone record
- only one writer thread. 
- bitcask will store hashmap snapshots

![](https://dl.dropboxusercontent.com/u/75194/riak1.png)

![](https://dl.dropboxusercontent.com/u/75194/riak2.png)

(from riak bitcask intro at http://basho.com/wp-content/uploads/2015/05/bitcask-intro.pdf )

In [75]:
import os.path
import sys
class Database():
    """A tiny append-only key/value store backed by a single data file.

    Every ``set`` appends one record to the data file and remembers the
    record's starting offset in the in-memory ``index`` dict.  A record is
    laid out as::

        [1 byte key length][1 byte value length][key bytes][value bytes]

    Keys and values are UTF-8 encoded strings of at most 255 bytes each
    (the lengths are stored in a single byte).  Deleting a key appends a
    record whose value is ``tombstoneValue``; the key stays in ``index`` and
    ``get`` reports it as missing.  Consequently ``tombstoneValue`` itself
    cannot be stored as a regular value.

    ``close`` saves the index next to the data file as ``<file>.idx`` with
    one ``key:offset`` entry per line.  If that file is missing or cannot be
    parsed when the database is reopened, the index is rebuilt by scanning
    the data file.
    """

    def __init__(self, file):
        """Open the database stored at ``file``, creating it if necessary.

        Args:
            file: Path of the data file.  The index is kept in ``file + ".idx"``.
        """
        self.tombstoneValue = str(sys.maxsize)
        self.file = file
        self.byteorder = sys.byteorder
        if not os.path.exists(file):
            self.fd = open(file, "xb+", buffering=0)
            self.index = {}
        else:
            self.fd = open(file, "r+b", buffering=0)
            try:
                self.index = self._load_index()
            except (FileNotFoundError, ValueError):
                # No usable .idx file (e.g. the previous session never
                # called close()): recover the offsets from the log itself.
                self.index = self._rebuild_index()
                self.fd.seek(0)
        self.readptr = self.fd.tell()
        self.fd.seek(0, 2)
        self.writeptr = self.fd.tell()

    def _load_index(self):
        """Read ``<file>.idx`` and return its contents as a key -> offset dict."""
        index = {}
        with open(self.file + ".idx") as fdi:
            for line in fdi:
                line = line.rstrip("\n")
                if not line:
                    continue
                # Split on the LAST colon so that keys containing ':' survive.
                key, offset = line.rsplit(":", 1)
                index[key] = int(offset)
        return index

    def _rebuild_index(self):
        """Scan the data file and return a key -> offset-of-latest-record dict."""
        index = {}
        size = self.fd.seek(0, 2)
        pos = 0
        while pos + 2 <= size:
            self.fd.seek(pos)
            header = self.fd.read(2)
            end = pos + 2 + header[0] + header[1]
            if end > size:
                # Incomplete trailing record; ignore it.
                break
            key = self.fd.read(header[0]).decode('utf-8')
            # Later records overwrite earlier ones, so the newest offset wins.
            index[key] = pos
            pos = end
        return index

    def _pack_field(self, text, label):
        """Return ``(length_byte, encoded_bytes)`` for a key or value string.

        Raises:
            ValueError: if ``text`` is not a ``str``.
            OverflowError: if the UTF-8 encoding is longer than 255 bytes.
        """
        if not isinstance(text, str):
            raise ValueError("{} must be a string".format(label))
        data = text.encode('utf-8')
        return len(data).to_bytes(1, byteorder=self.byteorder), data

    def set(self, x, v):
        """Store value ``v`` under key ``x`` by appending a new record.

        The index is only updated after the record has been written, so a
        failed write leaves any previous value of ``x`` reachable.

        Raises:
            ValueError: if ``x`` or ``v`` is not a string.
            OverflowError: if ``x`` or ``v`` encodes to more than 255 bytes.
            OSError: if writing to the data file fails.
        """
        sz_x, bin_x = self._pack_field(x, "Key")
        sz_v, bin_v = self._pack_field(v, "Value")
        self.fd.seek(self.writeptr)
        print("currently", self.fd.tell())
        self.fd.write(sz_x + sz_v + bin_x + bin_v)
        self.index[x] = self.writeptr
        self.writeptr = self.fd.tell()

    def get(self, x):
        """Return the value most recently stored under key ``x``.

        Raises:
            ValueError: if ``x`` is not in the index, or if its latest record
                is a tombstone (the key has been deleted).
        """
        try:
            offset = self.index[x]
        except (KeyError, TypeError):
            raise ValueError("{} is not in index".format(x))
        print("offset is", offset)
        self.readptr = offset
        self.fd.seek(self.readptr)
        sz_k = int.from_bytes(self.fd.read(1), byteorder=self.byteorder)
        sz_v = int.from_bytes(self.fd.read(1), byteorder=self.byteorder)
        # Skip over the key bytes; only the value is needed.
        self.fd.seek(sz_k, 1)
        readit = self.fd.read(sz_v).decode('utf-8')
        print("now", self.fd.tell())

        # If the value is the tombstone value, the KV has been deleted
        if readit == self.tombstoneValue:
            raise ValueError("Key not Found")

        return readit

    def delete(self, x):
        """Delete key ``x`` by appending a tombstone record for it.

        The key remains in ``index`` (now pointing at the tombstone) so that
        the deletion survives ``close`` and reopen.

        Raises:
            ValueError: if ``x`` is not in the index.
        """
        try:
            self.index[x]
        except (KeyError, TypeError):
            raise ValueError("Key not found")
        # Only touch the index once the tombstone is safely written (set does
        # that), so a failed write cannot lose the existing entry.
        self.set(x, self.tombstoneValue)

    def close(self):
        """Save the index to ``<file>.idx`` and close the data file."""
        try:
            with open(self.file + ".idx", "w") as fdi:
                fdi.write("\n".join([k + ":" + str(v) for k, v in self.index.items()]))
        finally:
            # Release the data file even if the index could not be written.
            self.fd.close()

    def __del__(self):
        """Close the data file if it was ever opened (the index is NOT saved)."""
        fd = getattr(self, "fd", None)
        if fd is not None:
            fd.close()

In [76]:
!rm /tmp/test.db

In [77]:
db = Database("/tmp/test.db")

In [78]:
print(db.index)

{}


In [79]:
db.set("rahul", "aged")
db.delete("rahul")
db.get("rahul")

currently 0
currently 11
offset is 11
now 37


ValueError: Key not Found

In [80]:
db.set("rahul", "aged")
db.set("pavlos", "aged")
db.set("kobe", "stillyoung")

currently 37
currently 48
currently 60


In [81]:
print(db.index)

{'pavlos': 48, 'kobe': 60, 'rahul': 37}


In [82]:
db.get("pavlos")

offset is 48
now 60


'aged'

In [83]:
db.set("rahul","young")

currently 76


In [84]:
print(db.index)

{'pavlos': 48, 'kobe': 60, 'rahul': 76}


In [85]:
db.get("kobe")

offset is 60
now 76


'stillyoung'

In [86]:
db.get("rahul")

offset is 76
now 88


'young'

In [87]:
db.get("pavlos")

offset is 48
now 60


'aged'

In [88]:
db.index

{'kobe': 60, 'pavlos': 48, 'rahul': 76}

In [89]:
db.set("kobe", "retired")

currently 88


In [90]:
db.index

{'kobe': 88, 'pavlos': 48, 'rahul': 76}

In [91]:
print(db.get("rahul"))
print(db.get("pavlos"))
print(db.get("kobe"))

offset is 76
now 88
young
offset is 48
now 60
aged
offset is 88
now 101
retired


In [92]:
db.set("obama","president")
db.index

currently 101


{'kobe': 88, 'obama': 101, 'pavlos': 48, 'rahul': 76}

In [93]:
print(db.get("rahul"))
print(db.get("pavlos"))
print(db.get("kobe"))
print(db.get("obama"))

offset is 76
now 88
young
offset is 48
now 60
aged
offset is 88
now 101
retired
offset is 101
now 117
president


In [94]:
db.close()

In [95]:
db=Database("/tmp/test.db")
print(db.get("rahul"))
print(db.get("pavlos"))
print(db.get("kobe"))
print(db.get("obama"))

offset is 76
now 88
young
offset is 48
now 60
aged
offset is 88
now 101
retired
offset is 101
now 117
president


In [96]:
db.set("pavlos", "ancient")
db.index

currently 117


{'kobe': 88, 'obama': 101, 'pavlos': 117, 'rahul': 76}

In [97]:
print(db.get("rahul"))
print(db.get("pavlos"))
print(db.get("kobe"))
print(db.get("obama"))

offset is 76
now 88
young
offset is 117
now 132
ancient
offset is 88
now 101
retired
offset is 101
now 117
president


In [98]:
db.close()

In [99]:
db=Database("/tmp/test.db")
db.index

{'kobe': 88, 'obama': 101, 'pavlos': 117, 'rahul': 76}

In [100]:
db.delete('kobe')
db.index

currently 132


{'kobe': 132, 'obama': 101, 'pavlos': 117, 'rahul': 76}

In [101]:
db.delete("obama")
db.get("obama")

currently 157
offset is 157
now 183


ValueError: Key not Found

In [102]:
db.set("obama", "US president")

currently 183


In [103]:
db.index

{'kobe': 132, 'obama': 183, 'pavlos': 117, 'rahul': 76}

In [104]:
db.close()

In [105]:
db=Database("/tmp/test.db")
db.index

{'kobe': 132, 'obama': 183, 'pavlos': 117, 'rahul': 76}

In [106]:
db.get("obama")

offset is 183
now 202


'US president'

In [107]:
db.get("kobe")

offset is 132
now 157


ValueError: Key not Found