Author: Xinnan Shen<br>
Date: 01-09-2020

# Learned Index Model Classification Demo
This notebook is to build some simple learned index models based on Kraska's paper. (classification model)

In this model, I have used pages instead of real locations in databases in the dataset.

Step 1: Generate some simple dataset in csv format

In [2]:
import os
import codecs
import random
#function: data_generation
#usage: generate a simple dataset
#parameters:
#1.len_num: the size of dataset
#2.range_min: the minimum key
#3.range_max: the maximun key
#output dataset: two columns (key,location)
def data_generation(len_num,range_min,range_max):
	datalist=[]
	for i in range(0,len_num):
		x=random.randint(range_min,range_max)
		datalist.append(x)
	for i in range(0,len(datalist)):
		temp=False
		for j in range(0,len(datalist)-i-1):
			if datalist[j]>datalist[j+1]:
				t=datalist[j]
				datalist[j]=datalist[j+1]
				datalist[j+1]=t
				temp=True
		if not temp:
			break
	current_path=os.path.abspath(os.curdir)
	f=codecs.open(os.path.join(current_path,"data.csv"), "w", "utf-8")
	for i in range(0,len(datalist)):
		f.write(str(datalist[i])+","+str(int(i/(len_num/100)))+"\n")
	f.close()
	return


Provide some value and generate the dataset

In [3]:
minkey=1000
maxkey=9999
keynum=3000
data_generation(3000,1000,9999)

Step 2: Split the dataset into training, development and testing dataset

In [4]:
from random import shuffle
import numpy as np
from sklearn.model_selection import train_test_split
current_path=os.path.abspath(os.curdir)
f=codecs.open(os.path.join(current_path,"data.csv"), "r", "utf-8")
strlist=f.read().split("\n")
f.close()
list_key=[]
list_res=[]
for ele in strlist:
    temp=ele.split(",")
    if len(temp)!=2:
        continue
    list_key.append(temp[0])
    list_res.append(temp[1])
keys=np.array(list_key)
res=np.array(list_res)
trainkeys,testkeys,trainres,testres=train_test_split(keys,res,test_size=0.35)
trainkeys,devkeys,trainres,devres=train_test_split(trainkeys,trainres,test_size=0.5)
trainkeys=list(trainkeys)
devkeys=list(devkeys)
testkeys=list(testkeys)
trainres=list(trainres)
devres=list(devres)
testres=list(testres)

f=codecs.open(os.path.join(current_path,"data_train.csv"), "w", "utf-8")
for i in range(0,len(trainkeys)):
    f.write(str(trainkeys[i])+","+str(trainres[i])+"\n")
f.close()
f=codecs.open(os.path.join(current_path,"data_dev.csv"), "w", "utf-8")
for i in range(0,len(devkeys)):
    f.write(str(devkeys[i])+","+str(devres[i])+"\n")
f.close()
f=codecs.open(os.path.join(current_path,"data_test.csv"), "w", "utf-8")
for i in range(0,len(testkeys)):
    f.write(str(testkeys[i])+","+str(testres[i])+"\n")
f.close()
print("training data size:",len(trainkeys))
print("development data size:",len(devkeys))
print("testing data size:",len(testkeys))

training data size: 975
development data size: 975
testing data size: 1050


In [5]:
import codecs
import os
minkey=1000
maxkey=9999
keynum=3000
current_path=os.path.abspath(os.curdir)
f=codecs.open(os.path.join(current_path,"data_train.csv"), "r", "utf-8")
strlist=f.read().split("\n")
f.close()
trainkeys=[]
trainres=[]
for ele in strlist:
    temp=ele.split(",")
    if len(temp)!=2:
        continue
    trainkeys.append(int(temp[0]))
    trainres.append(int(temp[1]))
f=codecs.open(os.path.join(current_path,"data_dev.csv"), "r", "utf-8")
strlist=f.read().split("\n")
f.close()
devkeys=[]
devres=[]
for ele in strlist:
    temp=ele.split(",")
    if len(temp)!=2:
        continue
    devkeys.append(int(temp[0]))
    devres.append(int(temp[1]))
f=codecs.open(os.path.join(current_path,"data_test.csv"), "r", "utf-8")
strlist=f.read().split("\n")
f.close()
testkeys=[]
testres=[]
for ele in strlist:
    temp=ele.split(",")
    if len(temp)!=2:
        continue
    testkeys.append(int(temp[0]))
    testres.append(int(temp[1]))
print("training data size:",len(trainkeys))
print("development data size:",len(devkeys))
print("testing data size:",len(testkeys))

training data size: 975
development data size: 975
testing data size: 1050


Step 4: Build a model

- KNN

In [15]:
from sklearn.neighbors import KNeighborsClassifier
import time
import numpy as np
from sklearn.metrics import classification_report
t1=time.time()
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(np.array(trainkeys).reshape(-1,1),np.array(trainres).reshape(-1,1))
t2=time.time()
time_interval=t2-t1
devpre=neigh.predict(np.array(devkeys).reshape(-1,1)).reshape(1,-1).tolist()[0]
print(classification_report(devres,devpre))
print("time interval for building model:"+str(time_interval*1000)+" ms")
t1=time.time()
testpre=neigh.predict(np.array(testkeys).reshape(-1,1)).reshape(1,-1).tolist()[0]
t2=time.time()
time_interval=t2-t1
print("time interval for indexing data:"+str(time_interval*1000)+" ms")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      0.93      0.96        14
           2       0.93      1.00      0.96        13
           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00         9
           5       0.93      1.00      0.96        13
           6       1.00      0.90      0.95        10
           7       0.87      1.00      0.93        13
           8       1.00      0.67      0.80         9
           9       0.89      0.89      0.89         9
          10       0.80      0.40      0.53        10
          11       0.62      0.91      0.74        11
          12       0.90      0.82      0.86        11
          13       0.77      1.00      0.87        10
          14       1.00      0.86      0.92         7
          15       1.00      1.00      1.00        10
          16       1.00      1.00      1.00         9
          17       1.00    

  import sys


- Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB
import time
import numpy as np
from sklearn.metrics import classification_report
t1=time.time()
NB = GaussianNB()
NB.fit(np.array(trainkeys).reshape(-1,1),np.array(trainres).reshape(-1,1))
t2=time.time()
time_interval=t2-t1
devpre=NB.predict(np.array(devkeys).reshape(-1,1)).reshape(1,-1).tolist()[0]
print(classification_report(devres,devpre))
print("time interval for building model:"+str(time_interval*1000)+" ms")
t1=time.time()
testpre=NB.predict(np.array(testkeys).reshape(-1,1)).reshape(1,-1).tolist()[0]
t2=time.time()
time_interval=t2-t1
print("time interval for indexing data:"+str(time_interval*1000)+" ms")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       0.88      1.00      0.93        14
           2       1.00      0.69      0.82        13
           3       0.82      1.00      0.90         9
           4       1.00      1.00      1.00         9
           5       0.93      1.00      0.96        13
           6       1.00      0.90      0.95        10
           7       0.93      1.00      0.96        13
           8       1.00      0.78      0.88         9
           9       0.90      1.00      0.95         9
          10       1.00      0.40      0.57        10
          11       0.65      1.00      0.79        11
          12       1.00      0.82      0.90        11
          13       0.83      1.00      0.91        10
          14       1.00      1.00      1.00         7
          15       1.00      1.00      1.00        10
          16       1.00      1.00      1.00         9
          17       1.00    

  y = column_or_1d(y, warn=True)


- Decision Tree

In [17]:
from sklearn import tree
import time
import numpy as np
from sklearn.metrics import classification_report
t1=time.time()
tree = tree.DecisionTreeClassifier()
tree.fit(np.array(trainkeys).reshape(-1,1),np.array(trainres).reshape(-1,1))
t2=time.time()
time_interval=t2-t1
devpre=tree.predict(np.array(devkeys).reshape(-1,1)).reshape(1,-1).tolist()[0]
print(classification_report(devres,devpre))
print("time interval for building model:"+str(time_interval*1000)+" ms")
t1=time.time()
testpre=tree.predict(np.array(testkeys).reshape(-1,1)).reshape(1,-1).tolist()[0]
t2=time.time()
time_interval=t2-t1
print("time interval for indexing data:"+str(time_interval*1000)+" ms")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        13
           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00         9
           5       0.93      1.00      0.96        13
           6       1.00      0.90      0.95        10
           7       0.93      1.00      0.96        13
           8       1.00      0.78      0.88         9
           9       0.89      0.89      0.89         9
          10       0.83      0.50      0.62        10
          11       0.69      1.00      0.81        11
          12       1.00      0.91      0.95        11
          13       0.83      1.00      0.91        10
          14       1.00      0.86      0.92         7
          15       1.00      1.00      1.00        10
          16       1.00      1.00      1.00         9
          17       1.00    

- Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
import time
import numpy as np
from sklearn.metrics import classification_report
t1=time.time()
forest = RandomForestClassifier()
forest.fit(np.array(trainkeys).reshape(-1,1),np.array(trainres).reshape(-1,1))
t2=time.time()
time_interval=t2-t1
devpre=forest.predict(np.array(devkeys).reshape(-1,1)).reshape(1,-1).tolist()[0]
print(classification_report(devres,devpre))
print("time interval for building model:"+str(time_interval*1000)+" ms")
t1=time.time()
testpre=forest.predict(np.array(testkeys).reshape(-1,1)).reshape(1,-1).tolist()[0]
t2=time.time()
time_interval=t2-t1
print("time interval for indexing data:"+str(time_interval*1000)+" ms")

  import sys


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        13
           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00         9
           5       0.93      1.00      0.96        13
           6       1.00      0.90      0.95        10
           7       0.93      1.00      0.96        13
           8       1.00      0.78      0.88         9
           9       0.89      0.89      0.89         9
          10       0.83      0.50      0.62        10
          11       0.69      1.00      0.81        11
          12       1.00      0.91      0.95        11
          13       0.83      1.00      0.91        10
          14       1.00      0.86      0.92         7
          15       1.00      1.00      1.00        10
          16       1.00      1.00      1.00         9
          17       1.00    