# Test pretrained model to make sure it works correctly

In [None]:
!python3 -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 1.3MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-cp37-none-any.whl size=98051305 sha256=751593994cf7e8f26305e612d6fc4098a74365e58ef7575d982386fcebc88ea8
  Stored in directory: /tmp/pip-ephem-wheel-cache-easu08xl/wheels/df/94/ad/f5cf59224cea6b5686ac4fd1ad19c8a07bc026e13c36502d81
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [None]:
import spacy
# Load the pretrained spaCy pipeline that the download cell above installed
nlp = spacy.load('en_core_web_md')
# Run the pipeline on a single sentence
doc = nlp("This is some text that I am processing with Spacy")
# After this call every token of `doc` carries its word vector.
# Smoke test: display the vector of the token at index 3 ('text').
# This is the last expression of the cell, so the notebook renders its value.
doc[3].vector
# Alternative: the mean vector for the entire sentence (useful for sentence classification etc.)
# doc.vector

array([ 0.037103 , -0.31259  , -0.17857  ,  0.30001  ,  0.078154 ,
        0.17958  ,  0.12048  , -0.11879  , -0.20601  ,  1.2849   ,
       -0.20409  ,  0.80613  ,  0.34344  , -0.19191  , -0.084511 ,
        0.17339  ,  0.042483 ,  2.0282   , -0.16278  , -0.60306  ,
       -0.53766  ,  0.35711  ,  0.22882  ,  0.1171   ,  0.42983  ,
        0.16165  ,  0.407    ,  0.036476 ,  0.52636  , -0.13524  ,
       -0.016897 ,  0.029259 , -0.079115 , -0.32305  ,  0.052255 ,
       -0.3617   , -0.18355  , -0.34717  , -0.3691   ,  0.16881  ,
        0.21018  , -0.38376  , -0.096909 , -0.36296  , -0.37319  ,
        0.0021152,  0.32512  ,  0.063977 ,  0.36249  , -0.26935  ,
       -0.59341  , -0.13625  ,  0.016425 , -0.2474   , -0.07498  ,
        0.034708 , -0.01476  , -0.11648  ,  0.25559  , -0.35002  ,
       -0.52707  ,  0.21221  ,  0.062456 ,  0.26184  ,  0.53149  ,
        0.34957  , -0.22692  ,  0.44076  ,  0.4438   ,  0.6335   ,
       -0.049757 , -0.08134  ,  0.65618  , -0.4716   ,  0.0906

# Read dataset

## Read data

In [None]:
import codecs
import os

# NOTE(review): minkey, maxkey and keynum are not read anywhere in this cell;
# they are kept unchanged in case other cells depend on them.
minkey=1000
maxkey=9999
keynum=3000
current_path=os.path.abspath(os.curdir)


def read_key_index_csv(path):
    """Read a two-column ``key,index`` CSV file.

    Lines that do not split into exactly two comma-separated fields
    (for example blank lines) are skipped.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(keys, indexes)`` tuple of parallel lists: the first field of each
        accepted line as a string, and the second field converted with ``int``.

    Raises:
        ValueError: If the second field of an accepted line is not an integer.
    """
    keys=[]
    indexes=[]
    # The context manager closes the file even when reading raises.
    with codecs.open(path, "r", "utf-8") as handle:
        lines=handle.read().split("\n")
    for line in lines:
        fields=line.split(",")
        if len(fields)!=2:
            continue
        keys.append(fields[0])
        indexes.append(int(fields[1]))
    return keys, indexes


trainkeys, trainres = read_key_index_csv(os.path.join(current_path,"data.csv"))
# The development split is currently disabled:
# devkeys, devres = read_key_index_csv(os.path.join(current_path,"data_dev.csv"))
testkeys, testres = read_key_index_csv(os.path.join(current_path,"data_test.csv"))

# It is very time and space consuming to build models based on the entire dataset.
# Instead, the dataset is divided into 3 parts (training, dev, testing):
# models are built and trained on the training set, and index predictions are
# made on the testing set.

# Disabled: fold the other splits back into the training data.
# trainkeys.extend(devkeys)
# trainres.extend(devres)
# trainkeys.extend(testkeys)
# trainres.extend(testres)

print("training data size:",len(trainkeys))
# print("development data size:",len(devkeys))
print("testing data size:",len(testkeys))

training data size: 61543
testing data size: 20514


## Data preprocessing

In [None]:
import spacy
# Load the pretrained spaCy pipeline; calling it on a string returns a Doc,
# and that Doc's `.vector` is used as the embedding of the key.
pretrain = spacy.load('en_core_web_md')
# One embedding per key, in the same order as trainkeys / testkeys.
trainkeys_emb=[pretrain(key).vector for key in trainkeys]
testkeys_emb=[pretrain(key).vector for key in testkeys]

In [None]:
# Page number of every position: the position integer-divided by 100.
trainpages=[int(position)//100 for position in trainres]
testpages=[int(position)//100 for position in testres]

In [None]:
import numpy as np

def _as_column(values):
  """Return `values` as a 2-D NumPy array with a single column."""
  return np.array(values).reshape(-1,1)

# Training split: raw keys (R), key embeddings (X), positions (Y), page numbers (Z).
R_train, X_train = np.array(trainkeys), np.array(trainkeys_emb)
Y_train, Z_train = _as_column(trainres), _as_column(trainpages)
# Testing split, same layout.
R_test, X_test = np.array(testkeys), np.array(testkeys_emb)
Y_test, Z_test = _as_column(testres), _as_column(testpages)

In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning
# Ignore scikit-learn's DataConversionWarning from this point on.
# NOTE(review): presumably raised because the label arrays built above are
# single-column 2-D arrays rather than 1-D vectors — confirm.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Build Models

## B-Tree

In [None]:
import time
# ref: https://peefy.github.io/blog/2018/06/10/Python-BTree/
class BTreeNode:
    '''
    A single B-tree node.

    `keys` and `children` are plain lists; `n` records how many key slots are
    in use.
    '''
    def __init__(self, n = 0, isleaf = True):
        '''
        Create a node with empty `keys` and `children` lists.

        Args
        ===
        `n` : number of keys stored in the node

        `isleaf` : whether the node is a leaf

        '''
        # Number of keys stored in the node
        self.n = n
        # Key slots
        self.keys = []
        # Child pointer slots
        self.children = []
        # Whether the node is a leaf
        self.isleaf = isleaf

    def __str__(self):
        '''
        Render the first `n` keys followed by every child slot.

        Child slots are rendered with `str`, so child nodes are rendered
        recursively and empty slots appear as `None`.
        '''
        key_text = ''.join(str(self.keys[i]) + ' ' for i in range(self.n))
        child_text = ''.join(str(child) + ';' for child in self.children)
        return 'keys:[' + key_text + '];childrens:[' + child_text + ']\r\n'

    def diskread(self):
        '''
        Disk read (placeholder, does nothing).
        '''
        pass

    def diskwrite(self):
        '''
        Disk write (placeholder, does nothing).
        '''
        pass

    @classmethod
    def allocate_node(cls, key_max):
        '''
        Allocate a new, empty leaf node with pre-sized slot lists.

        The node gets `key_max` key slots and `key_max + 1` child slots, all
        initialised to `None`. It is created through `cls`, so calling this on
        a subclass returns an instance of that subclass.

        Args
        ===
        `key_max` : maximum number of keys the node can hold

        Return
        ===
        `btreenode` : the allocated B-tree node

        Example
        ===
        ```python
        btreenode = BTreeNode.allocate_node(5)
        ```
        '''
        node = cls()
        node.keys = [None] * key_max
        node.children = [None] * (key_max + 1)
        return node

class BTree:
    '''
    B-tree storing a set of mutually comparable keys.

    Nodes are created with `BTreeNode.allocate_node`, so every node has
    `KEY_MAX` key slots and `KEY_MAX + 1` child slots; `node.n` tells how many
    key slots are in use.
    '''
    def __init__(self, m = 3):
        '''
        Create an empty B-tree.

        Args
        ===
        `m` : minimum degree of the tree

        '''
        # Minimum degree of the B-tree
        self.M = m
        # Maximum number of keys in a node
        self.KEY_MAX = 2 * self.M - 1
        # Minimum number of keys in a non-root node
        self.KEY_MIN = self.M - 1
        # Maximum number of children of a node
        self.CHILD_MAX = self.KEY_MAX + 1
        # Minimum number of children of a non-root internal node
        self.CHILD_MIN = self.KEY_MIN + 1
        # Root node (None while the tree is empty)
        self.root: 'BTreeNode' = None

    def __new_node(self):
        '''
        Create a new, empty B-tree node.
        '''
        return BTreeNode.allocate_node(self.KEY_MAX)

    def insert(self, key):
        '''
        Insert `key` into the tree.

        Return
        ===
        `True` if the key was inserted, `False` if it was already present.
        '''
        # Reject duplicates
        if self.contain(key):
            return False
        # Empty tree: create the root
        if self.root is None:
            node = self.__new_node()
            node.diskwrite()
            self.root = node
        # Full root: grow the tree by one level before descending
        if self.root.n == self.KEY_MAX:
            pNode = self.__new_node()
            pNode.isleaf = False
            pNode.children[0] = self.root
            self.__split_child(pNode, 0, self.root)
            self.root = pNode
        self.__insert_non_full(self.root, key)
        return True

    def remove(self, key):
        '''
        Remove `key` from the tree.

        Return
        ===
        `True` if the key was removed, `False` if it was not present.
        '''
        # Nothing to do when the key is absent (also covers the empty tree)
        if not self.__search(self.root, key):
            return False
        # Special handling of a root holding a single key
        if self.root.n == 1:
            if self.root.isleaf:
                # `key` is the only key in the tree: the tree becomes empty
                self.clear()
                return True
            pChild1 = self.root.children[0]
            pChild2 = self.root.children[1]
            if pChild1.n == self.KEY_MIN and pChild2.n == self.KEY_MIN:
                # Both children are minimal: merge them and shrink the tree
                self.__merge_child(self.root, 0)
                self.__delete_node(self.root)
                self.root = pChild1
        self.__recursive_remove(self.root, key)
        return True

    def display(self):
        '''
        Print the keys of the tree.
        '''
        self.__display_in_concavo(self.root, self.KEY_MAX * 10)

    def contain(self, key):
        '''
        Return `True` if `key` is stored in the tree, `False` otherwise.
        '''
        return self.__search(self.root, key)

    def clear(self):
        '''
        Remove every key from the tree.
        '''
        self.__recursive_clear(self.root)
        self.root = None

    def __recursive_clear(self, pNode : 'BTreeNode'):
        '''
        Release the subtree rooted at `pNode`.
        '''
        if pNode is not None:
            if not pNode.isleaf:
                # A node with n keys has n + 1 children
                for i in range(pNode.n + 1):
                    self.__recursive_clear(pNode.children[i])
            self.__delete_node(pNode)

    def __delete_node(self, pNode : 'BTreeNode'):
        '''
        Release a node.

        This is a no-op hook: it cannot affect the caller's reference, and a
        node is reclaimed by Python once nothing references it any more.
        '''
        pass

    def __search(self, pNode : 'BTreeNode', key):
        '''
        Return `True` if `key` is found in the subtree rooted at `pNode`.
        '''
        if pNode is None:
            return False
        i = 0
        # Smallest index whose key is not smaller than `key`
        while i < pNode.n and key > pNode.keys[i]:
            i += 1
        if i < pNode.n and key == pNode.keys[i]:
            return True
        # Not in this node: a leaf has nowhere else to look
        if pNode.isleaf:
            return False
        return self.__search(pNode.children[i], key)

    def __split_child(self, pParent : 'BTreeNode', nChildIndex, pChild : 'BTreeNode'):
        '''
        Split the full node `pChild`, which is child `nChildIndex` of the
        non-full node `pParent`.

        `pChild` keeps its lower `KEY_MIN` keys, a new right sibling receives
        the upper `KEY_MIN` keys, and the median key moves up into `pParent`.
        '''
        # New right sibling
        pRightNode = self.__new_node()
        pRightNode.isleaf = pChild.isleaf
        pRightNode.n = self.KEY_MIN
        # Copy the keys located after the median
        for i in range(self.KEY_MIN):
            pRightNode.keys[i] = pChild.keys[i + self.CHILD_MIN]
        # Internal node: copy the matching child pointers as well
        if not pChild.isleaf:
            for i in range(self.CHILD_MIN):
                pRightNode.children[i] = pChild.children[i + self.CHILD_MIN]
        # The left part keeps the lower keys
        pChild.n = self.KEY_MIN
        # Shift the parent's keys and child pointers after nChildIndex one
        # slot to the right, walking from the end
        for i in range(nChildIndex, pParent.n):
            j = pParent.n + nChildIndex - i
            pParent.children[j + 1] = pParent.children[j]
            pParent.keys[j] = pParent.keys[j - 1]
        pParent.n += 1
        # Hook in the right sibling and lift the median key
        pParent.children[nChildIndex + 1] = pRightNode
        pParent.keys[nChildIndex] = pChild.keys[self.KEY_MIN]
        pChild.diskwrite()
        pRightNode.diskwrite()
        pParent.diskwrite()

    def __insert_non_full(self, pNode: 'BTreeNode', key):
        '''
        Insert `key` into the subtree rooted at the non-full node `pNode`.
        '''
        i = pNode.n
        if pNode.isleaf:
            # Walk backwards, shifting larger keys one slot to the right
            while i > 0 and key < pNode.keys[i - 1]:
                pNode.keys[i] = pNode.keys[i - 1]
                i -= 1
            pNode.keys[i] = key
            pNode.n += 1
            pNode.diskwrite()
        else:
            # Walk backwards to find the child that must receive the key
            while i > 0 and key < pNode.keys[i - 1]:
                i -= 1
            pChild = pNode.children[i]
            pNode.children[i].diskread()
            if pChild.n == self.KEY_MAX:
                # Split the full child, then pick the correct half
                self.__split_child(pNode, i, pChild)
                if key > pNode.keys[i]:
                    pChild = pNode.children[i + 1]
            self.__insert_non_full(pChild, key)

    def __display_in_concavo(self, pNode: 'BTreeNode', count):
        '''
        Print the subtree rooted at `pNode` in key order; every key is
        preceded by `count + 1` dashes and children use `count - 2`.
        '''
        if pNode is not None:
            for i in range(pNode.n):
                if not pNode.isleaf:
                    self.__display_in_concavo(pNode.children[i], count - 2)
                print('-' * (count + 1), end='')
                print(pNode.keys[i])
            if not pNode.isleaf:
                # Rightmost child: index n, not the last loop index
                self.__display_in_concavo(pNode.children[pNode.n], count - 2)

    def __merge_child(self, pParent: 'BTreeNode', index):
        '''
        Merge children `index` and `index + 1` of `pParent` (each holding
        `KEY_MIN` keys) and the separating parent key into child `index`.
        '''
        pChild1 = pParent.children[index]
        pChild2 = pParent.children[index + 1]
        # pChild1 receives the separator and every key of pChild2
        pChild1.n = self.KEY_MAX
        pChild1.keys[self.KEY_MIN] = pParent.keys[index]
        for i in range(self.KEY_MIN):
            pChild1.keys[i + self.KEY_MIN + 1] = pChild2.keys[i]
        if not pChild1.isleaf:
            for i in range(self.CHILD_MIN):
                pChild1.children[i + self.CHILD_MIN] = pChild2.children[i]
        # Remove the separator from the parent, shifting the rest left
        pParent.n -= 1
        for i in range(index, pParent.n):
            pParent.keys[i] = pParent.keys[i + 1]
            pParent.children[i + 1] = pParent.children[i + 2]
        # Drop the stale trailing slots so the merged-away node is unreferenced
        pParent.keys[pParent.n] = None
        pParent.children[pParent.n + 1] = None
        self.__delete_node(pChild2)

    def __recursive_remove(self, pNode: 'BTreeNode', key):
        '''
        Remove `key` from the subtree rooted at `pNode`.

        The caller guarantees that `key` is in the subtree and that `pNode` is
        either the root or holds more than `KEY_MIN` keys.
        '''
        i = 0
        while i < pNode.n and key > pNode.keys[i]:
            i += 1
        # The key is in this node
        if i < pNode.n and key == pNode.keys[i]:
            if pNode.isleaf:
                # Leaf: close the gap and shrink the node
                for j in range(i, pNode.n - 1):
                    pNode.keys[j] = pNode.keys[j + 1]
                pNode.n -= 1
                pNode.keys[pNode.n] = None
                return
            # Internal node: children on either side of the key
            pChildPrev = pNode.children[i]
            pChildNext = pNode.children[i + 1]
            if pChildPrev.n >= self.CHILD_MIN:
                # Replace the key with its predecessor and delete that instead
                prevKey = self.predecessor(pChildPrev)
                self.__recursive_remove(pChildPrev, prevKey)
                pNode.keys[i] = prevKey
                return
            elif pChildNext.n >= self.CHILD_MIN:
                # Replace the key with its successor and delete that instead
                nextKey = self.successor(pChildNext)
                self.__recursive_remove(pChildNext, nextKey)
                pNode.keys[i] = nextKey
                return
            else:
                # Both neighbours are minimal: merge them around the key
                self.__merge_child(pNode, i)
                self.__recursive_remove(pChildPrev, key)
        # The key is not in this node
        else:
            if pNode.isleaf:
                # Nothing below a leaf; the key is not in this subtree
                return
            # Root of the subtree that contains the key
            pChildNode = pNode.children[i]
            # A minimal child must gain a key before descending into it
            if pChildNode.n == self.KEY_MIN:
                pLeft = pNode.children[i - 1] if i > 0 else None
                pRight = pNode.children[i + 1] if i < pNode.n else None
                if pLeft is not None and pLeft.n >= self.CHILD_MIN:
                    # Borrow from the left sibling: the separator keys[i - 1]
                    # moves down to the front of pChildNode
                    for j in range(pChildNode.n):
                        k = pChildNode.n - j
                        pChildNode.keys[k] = pChildNode.keys[k - 1]
                    pChildNode.keys[0] = pNode.keys[i - 1]
                    if not pLeft.isleaf:
                        # The last child of pLeft becomes the first child
                        for j in range(pChildNode.n + 1):
                            k = pChildNode.n + 1 - j
                            pChildNode.children[k] = pChildNode.children[k - 1]
                        pChildNode.children[0] = pLeft.children[pLeft.n]
                    pChildNode.n += 1
                    # The last key of pLeft becomes the new separator
                    pNode.keys[i - 1] = pLeft.keys[pLeft.n - 1]
                    pLeft.n -= 1
                elif pRight is not None and pRight.n >= self.CHILD_MIN:
                    # Borrow from the right sibling: the separator keys[i]
                    # moves down to the end of pChildNode
                    pChildNode.keys[pChildNode.n] = pNode.keys[i]
                    pChildNode.n += 1
                    # The first key of pRight becomes the new separator
                    pNode.keys[i] = pRight.keys[0]
                    pRight.n -= 1
                    for j in range(pRight.n):
                        pRight.keys[j] = pRight.keys[j + 1]
                    if not pRight.isleaf:
                        # The first child of pRight becomes the last child;
                        # pRight still has n + 1 children to shift left
                        pChildNode.children[pChildNode.n] = pRight.children[0]
                        for j in range(pRight.n + 1):
                            pRight.children[j] = pRight.children[j + 1]
                elif pLeft is not None:
                    # Both siblings minimal: merge into the left sibling
                    self.__merge_child(pNode, i - 1)
                    pChildNode = pLeft
                elif pRight is not None:
                    # Merge the right sibling into pChildNode
                    self.__merge_child(pNode, i)
            self.__recursive_remove(pChildNode, key)

    def predecessor(self, pNode: 'BTreeNode'):
        '''
        Return the largest key of the subtree rooted at `pNode`.
        '''
        while not pNode.isleaf:
            pNode = pNode.children[pNode.n]
        return pNode.keys[pNode.n - 1]

    def successor(self, pNode: 'BTreeNode'):
        '''
        Return the smallest key of the subtree rooted at `pNode`.
        '''
        while not pNode.isleaf:
            pNode = pNode.children[0]
        return pNode.keys[0]

def test():
    '''
    Benchmark `BTree`: time inserting every key of `trainkeys` into a fresh
    tree of minimum degree 10, then time looking up every key of `testkeys`.

    Return
    ===
    `(build_ms, lookup_ms, lookup_avg_ms)` : total build time, total lookup
    time and average lookup time per test key, all in milliseconds. The
    average is 0.0 when `testkeys` is empty.
    '''
    tree = BTree(10)

    # perf_counter is monotonic and high-resolution, unlike the wall clock
    t1=time.perf_counter()
    for key in trainkeys:
        tree.insert(key)
    t2=time.perf_counter()
    time_interval=t2-t1
    print("time interval for building model:"+str(time_interval*1000)+" ms")
    ret1=time_interval*1000

    t1=time.perf_counter()
    for key in testkeys:
        tree.contain(key)
    t2=time.perf_counter()
    time_interval=t2-t1
    ret2=time_interval*1000
    # Guard against an empty test set
    ret3=time_interval/len(testkeys)*1000 if testkeys else 0.0
    print("time interval for indexing data :"+str(ret2)+" ms")
    print("average time interval for indexing data :"+str(ret3)+" ms")
    return (ret1,ret2,ret3)

if __name__ == '__main__':
    # Run the benchmark `counting` times and average each reported timing.
    counting=20
    totals=[0.0,0.0,0.0]
    for i in range(counting):
        (a,b,c)=test()
        totals=[running+value for running,value in zip(totals,(a,b,c))]
    avg_a,avg_b,avg_c=[total/counting for total in totals]
    print("average times (ms):",avg_a,avg_b,avg_c)

time interval for building model:181.47802352905273 ms
time interval for indexing data :8.741378784179688 ms
average time interval for indexing data :0.006288761715237185 ms
time interval for building model:53.22456359863281 ms
time interval for indexing data :9.008169174194336 ms
average time interval for indexing data :0.006480697247621824 ms
time interval for building model:54.76093292236328 ms
time interval for indexing data :9.122848510742188 ms
average time interval for indexing data :0.006563200367440423 ms
time interval for building model:53.444623947143555 ms
time interval for indexing data :9.528398513793945 ms
average time interval for indexing data :0.00685496295956399 ms
time interval for building model:57.19304084777832 ms
time interval for indexing data :9.490251541137695 ms
average time interval for indexing data :0.0068275190943436655 ms
time interval for building model:60.2266788482666 ms
time interval for indexing data :9.469032287597656 ms
average time interval for 

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error 
import math
import time
# print("Linear Regression Model")
def test():
  """Benchmark a LinearRegression model used as a learned index.

  Three phases are timed and printed, in milliseconds:
    1. fitting ``LinearRegression`` on ``X_train`` / ``Y_train``;
    2. predicting a position for every row of ``X_test`` (each prediction is
       truncated with ``int`` and made non-negative with ``abs``);
    3. "error correction": for every prediction, a linear scan of
       ``trainkeys`` for the key stored at the predicted position.

  Returns:
    ``(build_ms, predict_ms, predict_avg_ms, correction_ms, correction_avg_ms)``.
    ``predict_avg_ms`` is per test key and ``correction_avg_ms`` is per counted
    error, the same value that is printed. Both averages are 0.0 when their
    divisor is zero.
  """
  # perf_counter is monotonic and high-resolution, unlike the wall clock
  t1=time.perf_counter()
  reg = LinearRegression()
  reg.fit(X_train,Y_train)
  t2=time.perf_counter()
  time_interval=t2-t1
  print("time interval for building model:"+str(time_interval*1000)+" ms")
  ret1=time_interval*1000

  t1=time.perf_counter()
  testpre=reg.predict(X_test).reshape(1,-1).tolist()[0]
  for i in range(0,len(testpre)):
    testpre[i]=abs(int(testpre[i]))
  t2=time.perf_counter()
  time_interval=t2-t1
  ret2=time_interval*1000
  ret3=time_interval/len(testkeys)*1000 if testkeys else 0.0
  print("time interval for indexing data :"+str(ret2)+" ms")
  print("average time interval for indexing data :"+str(ret3)+" ms")

  t1=time.perf_counter()
  count_error=0
  last_loc=len(trainkeys)-1
  for i in range(0,len(testpre)):
    # Predictions are already non-negative, so only the upper bound needs clamping
    estimated_loc=min(testpre[i],last_loc)
    correct_res=testres[i]
    finding_res=trainkeys[estimated_loc]
    # NOTE(review): finding_res is a key taken from trainkeys while correct_res
    # comes from testres (parsed with int() in the data-loading cell), so this
    # test may always be true — confirm the intended comparison.
    if finding_res!=correct_res:
      count_error+=1
    # Linear scan of trainkeys; a separate index keeps the outer loop's `i` intact
    j=0
    while j<=last_loc:
      if finding_res==trainkeys[j]:
        break
      j+=1
  t2=time.perf_counter()
  time_interval=t2-t1
  ret4=time_interval*1000
  # Guard against a run in which no error was counted
  ret5=time_interval/count_error*1000 if count_error else 0.0
  print("time interval for error correction :"+str(ret4)+" ms")
  print("average time interval for error correction :"+str(ret5)+" ms")
  return (ret1,ret2,ret3,ret4,ret5)
# Run the benchmark `counting` times and average each of the five timings.
counting=20
totals=[0.0]*5
for i in range(counting):
  (a,b,c,d,e)=test()
  totals=[running+value for running,value in zip(totals,(a,b,c,d,e))]
avg_a,avg_b,avg_c,avg_d,avg_e=[total/counting for total in totals]
print("average times (ms):",avg_a,avg_b,avg_c,avg_d,avg_e)

time interval for building model:53.664207458496094 ms
time interval for indexing data :1.4972686767578125 ms
average time interval for indexing data :0.0010771717098977068 ms
time interval for error correction :317.9914951324463 ms
average time interval for error correction :0.22877085980751533 ms
time interval for building model:57.25741386413574 ms
time interval for indexing data :1.5833377838134766 ms
average time interval for indexing data :0.0011390919308010624 ms
time interval for error correction :314.28027153015137 ms
average time interval for error correction :0.22610091476989308 ms
time interval for building model:52.82473564147949 ms
time interval for indexing data :1.5990734100341797 ms
average time interval for indexing data :0.0011504125252044459 ms
time interval for error correction :319.4701671600342 ms
average time interval for error correction :0.22983465263311811 ms
time interval for building model:53.86519432067871 ms
time interval for indexing data :1.650094985961

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
import numpy as np
from sklearn.metrics import mean_squared_error 
import math
import time
def test():
  """Benchmark a Ridge(alpha=0.1) model used as a learned index.

  Three phases are timed and printed, in milliseconds:
    1. fitting ``Ridge`` on ``X_train`` / ``Y_train``;
    2. predicting a position for every row of ``X_test`` (each prediction is
       truncated with ``int`` and made non-negative with ``abs``);
    3. "error correction": for every prediction, a linear scan of
       ``trainkeys`` for the key stored at the predicted position.

  Returns:
    ``(build_ms, predict_ms, predict_avg_ms, correction_ms, correction_avg_ms)``.
    ``predict_avg_ms`` is per test key and ``correction_avg_ms`` is per counted
    error. Both averages are 0.0 when their divisor is zero.
  """
  # perf_counter is monotonic and high-resolution, unlike the wall clock
  t1=time.perf_counter()
  reg = Ridge(alpha=0.1)
  reg.fit(X_train,Y_train)
  t2=time.perf_counter()
  time_interval=t2-t1
  print("time interval for building model:"+str(time_interval*1000)+" ms")
  ret1=time_interval*1000

  t1=time.perf_counter()
  testpre=reg.predict(X_test).reshape(1,-1).tolist()[0]
  for i in range(0,len(testpre)):
    testpre[i]=abs(int(testpre[i]))
  t2=time.perf_counter()
  time_interval=t2-t1
  ret2=time_interval*1000
  ret3=time_interval/len(testkeys)*1000 if testkeys else 0.0
  print("time interval for indexing data :"+str(ret2)+" ms")
  print("average time interval for indexing data :"+str(ret3)+" ms")

  t1=time.perf_counter()
  count_error=0
  last_loc=len(trainkeys)-1
  for i in range(0,len(testpre)):
    # Predictions are already non-negative, so only the upper bound needs clamping
    estimated_loc=min(testpre[i],last_loc)
    correct_res=testres[i]
    finding_res=trainkeys[estimated_loc]
    # NOTE(review): finding_res is a key taken from trainkeys while correct_res
    # comes from testres (parsed with int() in the data-loading cell), so this
    # test may always be true — confirm the intended comparison.
    if finding_res!=correct_res:
      count_error+=1
    # Linear scan of trainkeys; a separate index keeps the outer loop's `i` intact
    j=0
    while j<=last_loc:
      if finding_res==trainkeys[j]:
        break
      j+=1
  t2=time.perf_counter()
  time_interval=t2-t1
  ret4=time_interval*1000
  # Guard against a run in which no error was counted
  ret5=time_interval/count_error*1000 if count_error else 0.0
  print("time interval for error correction :"+str(ret4)+" ms")
  print("average time interval for error correction :"+str(ret5)+" ms")
  return (ret1,ret2,ret3,ret4,ret5)
# Run the benchmark `counting` times and average each of the five timings.
counting=20
totals=[0.0]*5
for i in range(counting):
  (a,b,c,d,e)=test()
  totals=[running+value for running,value in zip(totals,(a,b,c,d,e))]
avg_a,avg_b,avg_c,avg_d,avg_e=[total/counting for total in totals]
print("average times (ms):",avg_a,avg_b,avg_c,avg_d,avg_e)

time interval for building model:16.058683395385742 ms
time interval for indexing data :2.3736953735351562 ms
average time interval for indexing data :0.0017076945133346448 ms
time interval for error correction :357.03039169311523 ms
average time interval for error correction :0.2568563969015218 ms
time interval for building model:15.332460403442383 ms
time interval for indexing data :2.542734146118164 ms
average time interval for indexing data :0.0018293051410922044 ms
time interval for error correction :347.3024368286133 ms
average time interval for error correction :0.24985786822202394 ms
time interval for building model:21.730899810791016 ms
time interval for indexing data :2.4759769439697266 ms
average time interval for indexing data :0.0017812783769566379 ms
time interval for error correction :343.780517578125 ms
average time interval for error correction :0.24732411336555754 ms
time interval for building model:16.062259674072266 ms
time interval for indexing data :2.528905868530

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
import time
import numpy as np
from sklearn.metrics import classification_report
def test():
  """Benchmark a GaussianNB classifier that predicts the page of a key.

  Three phases are timed and printed, in milliseconds:
    1. fitting ``GaussianNB`` on ``X_train`` with the page labels ``Z_train``;
    2. predicting a page for every row of ``X_test``;
    3. "error correction": for every test key, fall back to the true page
       (``correct_res//100``) when the predicted page does not contain the
       true position, then bisect the 100 positions of that page.

  Returns:
    ``(build_ms, predict_ms, predict_avg_ms, correction_ms, correction_avg_ms)``.
    Both averages are per test key and are 0.0 when ``testkeys`` is empty.
  """
  # The classifier expects a 1-D label vector; flattening the column here
  # avoids the DataConversionWarning raised for (n, 1) labels.
  train_pages=Z_train.ravel()
  # perf_counter is monotonic and high-resolution, unlike the wall clock
  t1=time.perf_counter()
  NB = GaussianNB()
  NB.fit(X_train,train_pages)
  t2=time.perf_counter()
  time_interval=t2-t1
  print("time interval for building model:"+str(time_interval*1000)+" ms")
  ret1=time_interval*1000

  t1=time.perf_counter()
  testpre=NB.predict(X_test).reshape(1,-1).tolist()[0]
  t2=time.perf_counter()
  time_interval=t2-t1
  ret2=time_interval*1000
  ret3=time_interval/len(testkeys)*1000 if testkeys else 0.0
  print("time interval for indexing data :"+str(ret2)+" ms")
  print("average time interval for indexing data :"+str(ret3)+" ms")

  t1=time.perf_counter()
  for i in range(0,len(testpre)):
    estimated_page=testpre[i]
    correct_res=testres[i]
    # Wrong page predicted: jump to the page that holds the true position
    if correct_res not in range(estimated_page*100,estimated_page*100+100):
      estimated_page=correct_res//100
    begin=estimated_page*100
    end=begin+100
    # Bisection over [begin, end); begin <= correct_res < end holds throughout,
    # so the loop always ends through the break below
    while begin<end:
      middle=(begin+end)//2
      if middle==correct_res:
        break
      elif middle<correct_res:
        begin=middle
      else:
        end=middle
  t2=time.perf_counter()
  time_interval=t2-t1
  ret4=time_interval*1000
  ret5=time_interval/len(testkeys)*1000 if testkeys else 0.0
  print("time interval for error correction :"+str(ret4)+" ms")
  print("average time interval for error correction :"+str(ret5)+" ms")
  return (ret1,ret2,ret3,ret4,ret5)
# Run the benchmark `counting` times and average each of the five timings.
counting=20
totals=[0.0]*5
for i in range(counting):
  (a,b,c,d,e)=test()
  totals=[running+value for running,value in zip(totals,(a,b,c,d,e))]
avg_a,avg_b,avg_c,avg_d,avg_e=[total/counting for total in totals]
print("average times (ms):",avg_a,avg_b,avg_c,avg_d,avg_e)

time interval for building model:10.509014129638672 ms
time interval for indexing data :98.35362434387207 ms
average time interval for indexing data :0.07075800312508783 ms
time interval for error correction :2.4483203887939453 ms
average time interval for error correction :0.001761381574671903 ms
time interval for building model:10.092496871948242 ms
time interval for indexing data :96.90403938293457 ms
average time interval for indexing data :0.06971513624671552 ms
time interval for error correction :2.111196517944336 ms
average time interval for error correction :0.001518846415787292 ms
time interval for building model:9.640693664550781 ms
time interval for indexing data :86.29345893859863 ms
average time interval for indexing data :0.06208162513568247 ms
time interval for error correction :2.133607864379883 ms
average time interval for error correction :0.0015349696866042322 ms
time interval for building model:9.632349014282227 ms
time interval for indexing data :90.77239036560059 

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import time
import numpy as np
from sklearn.metrics import classification_report
def test():
  """Run one timed build/predict/correct cycle for the KNN page classifier.

  Reads the notebook globals ``X_train``, ``Z_train``, ``X_test``,
  ``testkeys`` and ``testres``.  Three phases are timed separately:

  1. creating and fitting ``KNeighborsClassifier(n_neighbors=2)``;
  2. predicting a page number for every row of ``X_test``;
  3. "error correction": for each prediction, falling back to the true page
     (``testres[i]//100``) when the predicted page does not contain the true
     position, then binary-searching that 100-slot page for the position.

  Returns:
    Tuple ``(build_ms, index_ms, index_avg_ms, fix_ms, fix_avg_ms)`` of
    durations in milliseconds; the two averages are divided by
    ``len(testkeys)``.
  """
  # perf_counter() is monotonic and high-resolution, so it is the right clock
  # for measuring short intervals (time.time() can jump and may be coarse).
  t1=time.perf_counter()
  neigh = KNeighborsClassifier(n_neighbors=2)
  neigh.fit(X_train,Z_train)
  time_interval=time.perf_counter()-t1
  print("time interval for building model:"+str(time_interval*1000)+" ms")
  ret1=time_interval*1000

  t1=time.perf_counter()
  # Flatten the predictions into a plain Python list of page numbers.
  testpre=neigh.predict(X_test).reshape(1,-1).tolist()[0]
  time_interval=time.perf_counter()-t1
  print("time interval for indexing data :"+str(time_interval*1000)+" ms")
  # NOTE(review): averages divide by len(testkeys), not len(testpre) --
  # assumes testkeys has one entry per row of X_test; confirm.
  print("average time interval for indexing data :"+str(time_interval/len(testkeys)*1000)+" ms")
  ret2=time_interval*1000
  ret3=time_interval/len(testkeys)*1000

  # The search is kept inline (no helper call) so that only the search itself
  # contributes to the measured time.
  t1=time.perf_counter()
  for i,estimated_page in enumerate(testpre):
    correct_res=testres[i]
    if correct_res not in range(estimated_page*100,estimated_page*100+100):
      # Misprediction: jump to the page that really holds the position.
      estimated_page=correct_res//100
    begin=estimated_page*100
    end=begin+100
    # Half-open binary search over [begin, end).  Advancing the lower bound
    # past `middle` shrinks the interval on every step, so the loop always
    # terminates, even if the target were ever outside the page.
    while begin<end:
      middle=(begin+end)//2
      if middle==correct_res:
        estimated_loc=middle  # result is discarded; only the cost is measured
        break
      elif middle<correct_res:
        begin=middle+1
      else:
        end=middle
  time_interval=time.perf_counter()-t1
  print("time interval for error correction :"+str(time_interval*1000)+" ms")
  print("average time interval for error correction :"+str(time_interval/len(testkeys)*1000)+" ms")
  ret4=time_interval*1000
  ret5=time_interval/len(testkeys)*1000
  return (ret1,ret2,ret3,ret4,ret5)
# Call test() `counting` times and average the five timings it returns
# (build, indexing total, indexing per key, correction total, correction per key).
counting=20
avg_a=avg_b=avg_c=avg_d=avg_e=0.0
for i in range(counting):
  a,b,c,d,e=test()
  avg_a,avg_b,avg_c,avg_d,avg_e=avg_a+a,avg_b+b,avg_c+c,avg_d+d,avg_e+e
# Turn the running totals into per-run means.
avg_a,avg_b,avg_c,avg_d,avg_e=[total/counting for total in (avg_a,avg_b,avg_c,avg_d,avg_e)]
print("average times (ms):",avg_a,avg_b,avg_c,avg_d,avg_e)

time interval for building model:4332.1967124938965 ms
time interval for indexing data :992711.7531299591 ms
average time interval for indexing data :48.391915429948284 ms
time interval for error correction :29.662370681762695 ms
average time interval for error correction :0.0014459574281838108 ms
time interval for building model:4092.038869857788 ms


## Decision Tree

In [None]:
from sklearn import tree
import time
import numpy as np
from sklearn.metrics import classification_report
def test():
  """Run one timed build/predict/correct cycle for the decision-tree classifier.

  Reads the notebook globals ``X_train``, ``Z_train``, ``X_test``,
  ``testkeys`` and ``testres``.  Three phases are timed separately:

  1. creating and fitting ``tree.DecisionTreeClassifier(max_depth=None)``;
  2. predicting a page number for every row of ``X_test``;
  3. "error correction": for each prediction, falling back to the true page
     (``testres[i]//100``) when the predicted page does not contain the true
     position, then binary-searching that 100-slot page for the position.

  Returns:
    Tuple ``(build_ms, index_ms, index_avg_ms, fix_ms, fix_avg_ms)`` of
    durations in milliseconds; the two averages are divided by
    ``len(testkeys)``.
  """
  # perf_counter() is monotonic and high-resolution, so it is the right clock
  # for measuring short intervals (time.time() can jump and may be coarse).
  t1=time.perf_counter()
  dtree = tree.DecisionTreeClassifier(max_depth=None)
  dtree.fit(X_train,Z_train)
  time_interval=time.perf_counter()-t1
  print("time interval for building model:"+str(time_interval*1000)+" ms")
  ret1=time_interval*1000

  t1=time.perf_counter()
  # Flatten the predictions into a plain Python list of page numbers.
  testpre=dtree.predict(X_test).reshape(1,-1).tolist()[0]
  time_interval=time.perf_counter()-t1
  print("time interval for indexing data :"+str(time_interval*1000)+" ms")
  # NOTE(review): averages divide by len(testkeys), not len(testpre) --
  # assumes testkeys has one entry per row of X_test; confirm.
  print("average time interval for indexing data :"+str(time_interval/len(testkeys)*1000)+" ms")
  ret2=time_interval*1000
  ret3=time_interval/len(testkeys)*1000

  # The search is kept inline (no helper call) so that only the search itself
  # contributes to the measured time.
  t1=time.perf_counter()
  for i,estimated_page in enumerate(testpre):
    correct_res=testres[i]
    if correct_res not in range(estimated_page*100,estimated_page*100+100):
      # Misprediction: jump to the page that really holds the position.
      estimated_page=correct_res//100
    begin=estimated_page*100
    end=begin+100
    # Half-open binary search over [begin, end).  Advancing the lower bound
    # past `middle` guarantees termination without an extra width guard, and
    # a width-1 interval is still probed, so a target sitting in the first
    # slot of the page (correct_res == begin) is found as well.
    while begin<end:
      middle=(begin+end)//2
      if middle==correct_res:
        estimated_loc=middle  # result is discarded; only the cost is measured
        break
      elif middle<correct_res:
        begin=middle+1
      else:
        end=middle
  time_interval=time.perf_counter()-t1
  print("time interval for error correction :"+str(time_interval*1000)+" ms")
  print("average time interval for error correction :"+str(time_interval/len(testkeys)*1000)+" ms")
  ret4=time_interval*1000
  ret5=time_interval/len(testkeys)*1000
  return (ret1,ret2,ret3,ret4,ret5)
# Call test() `counting` times and average the five timings it returns
# (build, indexing total, indexing per key, correction total, correction per key).
counting=20
avg_a=avg_b=avg_c=avg_d=avg_e=0.0
for i in range(counting):
  a,b,c,d,e=test()
  avg_a,avg_b,avg_c,avg_d,avg_e=avg_a+a,avg_b+b,avg_c+c,avg_d+d,avg_e+e
# Turn the running totals into per-run means.
avg_a,avg_b,avg_c,avg_d,avg_e=[total/counting for total in (avg_a,avg_b,avg_c,avg_d,avg_e)]
print("average times (ms):",avg_a,avg_b,avg_c,avg_d,avg_e)

time interval for building model:1665768.8009738922 ms
time interval for indexing data :74.79166984558105 ms
average time interval for indexing data :0.0036458842666267457 ms
time interval for error correction :34.85679626464844 ms
average time interval for error correction :0.0016991711155624665 ms


KeyboardInterrupt: ignored

## Neural Networks

In [None]:
import numpy as np
# One-hot encode the training labels: row i of T_train is all zeros except
# for a 1.0 in column Z_train[i].  `temp` keeps the labels as a single row
# and is read again further down when the network's output layer is sized.
temp=Z_train.reshape(1,-1)
# Picking rows of an identity matrix by label yields the one-hot rows directly.
T_train=np.eye(temp.max()+1)[temp[0]]
print(T_train)

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


In [None]:
# Sanity check: show the dimensions of the training feature matrix.
print(X_train.shape)

(61543, 300)


In [None]:
import tensorflow as tf
from tensorboard.plugins.hparams import api
from keras import models as md
from keras import layers as lr
import time
import numpy as np
from sklearn.metrics import classification_report
def test():
  """Run one timed build/predict/correct cycle for the dense neural network.

  Reads the notebook globals ``X_train``, ``T_train``, ``temp``, ``X_test``,
  ``testkeys`` and ``testres``.  Three phases are timed separately:

  1. building, compiling and fitting a ``Dense(300) -> Dense(128) ->
     softmax`` Keras model on the one-hot targets ``T_train`` (16 epochs,
     batch size 32);
  2. predicting class probabilities for ``X_test`` and taking the arg-max as
     the page number;
  3. "error correction": for each prediction, falling back to the true page
     (``testres[i]//100``) when the predicted page does not contain the true
     position, then binary-searching that 100-slot page for the position.

  Returns:
    Tuple ``(build_ms, index_ms, index_avg_ms, fix_ms, fix_avg_ms)`` of
    durations in milliseconds; the two averages are divided by
    ``len(testkeys)``.
  """
  # perf_counter() is monotonic and high-resolution, so it is the right clock
  # for measuring short intervals (time.time() can jump and may be coarse).
  t1=time.perf_counter()
  model = md.Sequential()
  model.add(lr.Dense(300,activation="relu"))
  model.add(lr.Dense(128,activation="relu"))
  # One output unit per label value seen in the training labels.
  model.add(lr.Dense(temp.max()+1,activation="softmax"))
  model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])
  model.fit(X_train, T_train, epochs=16, batch_size=32)
  time_interval=time.perf_counter()-t1
  print("time interval for building model:"+str(time_interval*1000)+" ms")
  ret1=time_interval*1000

  t1=time.perf_counter()
  testpre=model.predict(X_test)
  # Most probable class per row becomes the predicted page number.
  testpre=np.argmax(testpre,axis=1)
  time_interval=time.perf_counter()-t1
  print("time interval for indexing data :"+str(time_interval*1000)+" ms")
  # NOTE(review): averages divide by len(testkeys), not len(testpre) --
  # assumes testkeys has one entry per row of X_test; confirm.
  print("average time interval for indexing data :"+str(time_interval/len(testkeys)*1000)+" ms")
  ret2=time_interval*1000
  ret3=time_interval/len(testkeys)*1000

  # The search is kept inline (no helper call) so that only the search itself
  # contributes to the measured time.
  t1=time.perf_counter()
  for i in range(0,len(testpre)):
    estimated_page=testpre[i]
    correct_res=testres[i]
    if correct_res not in range(estimated_page*100,estimated_page*100+100):
      # Misprediction: jump to the page that really holds the position.
      estimated_page=correct_res//100
    begin=estimated_page*100
    end=begin+100
    # Half-open binary search over [begin, end).  Advancing the lower bound
    # past `middle` shrinks the interval on every step, so the loop always
    # terminates, even if the target were ever outside the page.
    while begin<end:
      middle=(begin+end)//2
      if middle==correct_res:
        estimated_loc=middle  # result is discarded; only the cost is measured
        break
      elif middle<correct_res:
        begin=middle+1
      else:
        end=middle
  time_interval=time.perf_counter()-t1
  print("time interval for error correction :"+str(time_interval*1000)+" ms")
  print("average time interval for error correction :"+str(time_interval/len(testkeys)*1000)+" ms")
  ret4=time_interval*1000
  ret5=time_interval/len(testkeys)*1000
  return (ret1,ret2,ret3,ret4,ret5)
# Call test() `counting` times and average the five timings it returns
# (build, indexing total, indexing per key, correction total, correction per key).
counting=20
avg_a=avg_b=avg_c=avg_d=avg_e=0.0
for i in range(counting):
  a,b,c,d,e=test()
  avg_a,avg_b,avg_c,avg_d,avg_e=avg_a+a,avg_b+b,avg_c+c,avg_d+d,avg_e+e
# Turn the running totals into per-run means.
avg_a,avg_b,avg_c,avg_d,avg_e=[total/counting for total in (avg_a,avg_b,avg_c,avg_d,avg_e)]
print("average times (ms):",avg_a,avg_b,avg_c,avg_d,avg_e)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
time interval for building model:151715.08622169495 ms
time interval for indexing data :1276.4692306518555 ms
average time interval for indexing data :0.06222429709719486 ms
time interval for error correction :80.81841468811035 ms
average time interval for error correction :0.003939671184952245 ms
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
time interval for building model:150816.13111495972 ms
time interval for indexing data :1216.1238193511963 ms
average time interval for indexing data :0.05928262744229289 ms
time interval for error correction :77.34823226928711 ms
average time interval for error correction :0.003770509518830414 ms
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epo