# Retrieval

In [307]:
import numpy

In [308]:
def sumColumn(matrix):
    """Return the column totals of ``matrix`` (sum taken along axis 0)."""
    column_totals = numpy.sum(matrix, axis=0)
    return column_totals
def sumRow(matrix):
    """Return the row totals of ``matrix`` (sum taken along axis 1)."""
    row_totals = numpy.sum(matrix, axis=1)
    return row_totals
def emprical_language_model(matrix):
    """Normalise each column of ``matrix`` by that column's total.

    Every entry is divided by the sum of its column and the quotient is
    rounded to three decimals. Divide/invalid floating-point warnings are
    silenced, so a column whose total is zero yields NaN entries quietly.
    """
    with numpy.errstate(divide='ignore', invalid='ignore'):
        column_totals = numpy.sum(matrix, axis=0)
        column_shares = matrix / column_totals
        return numpy.round(column_shares, decimals=3)
def collection_language_model(matrix):
    """Return each row's share of the grand total of ``matrix``.

    The row totals (via ``sumRow``) are divided by the sum of all entries
    and rounded to three decimals. Divide/invalid floating-point warnings
    are silenced while computing.
    """
    with numpy.errstate(divide='ignore', invalid='ignore'):
        row_totals = sumRow(matrix)
        grand_total = matrix.sum()
        return numpy.round(row_totals / grand_total, 3)
def Jelinek_Mercer_smoothing(matrix,param):
    """Linearly interpolate the per-column model with the collection model.

    Computes ``(1 - param) * emprical_language_model(matrix)
    + param * collection_language_model(matrix)`` and rounds the result to
    three decimals, so ``param`` is the weight given to the collection model.
    Both inputs to the mix are themselves already rounded to three decimals.
    """
    with numpy.errstate(divide='ignore', invalid='ignore'):
        document_model = emprical_language_model(matrix)
        background_model = collection_language_model(matrix)
        document_part = (1-param)*document_model
        background_part = param*background_model
        return numpy.round(document_part+background_part,3)
    
def Dirichlet_smoothing(matrix,param):
    """Smooth the counts in ``matrix`` with ``param`` pseudo-counts.

    Computes ``(matrix + param * collection_language_model(matrix))
    / (sumColumn(matrix) + param)`` and rounds the result to three decimals.
    The collection model used here is already rounded to three decimals.
    """
    with numpy.errstate(divide='ignore', invalid='ignore'):
        background_model = collection_language_model(matrix)
        smoothed_counts = matrix+param*background_model
        smoothed_lengths = sumColumn(matrix)+param
        return numpy.round(smoothed_counts/smoothed_lengths,3)
    
def BM25(matrix, k1, b, avgdl, collection, N, length=None):
    """Return the matrix of per-entry BM25 scores.

    Each entry ``tf`` of ``matrix`` is scored as::

        tf * (1 + k1) / (tf + k1 * (1 - b + b * d / avgdl)) * log10(N / collection)

    Parameters:
        matrix: 2-D table of counts; ``d`` and ``collection`` are broadcast
            against it.
        k1, b: BM25 tuning constants.
        avgdl: value the lengths ``d`` are divided by (the average length).
        collection: divisor of ``N`` inside the logarithm; it must broadcast
            against ``matrix`` (the notebook passes one value per row).
        N: numerator inside the logarithm (the collection size).
        length: optional per-column lengths ``d``. When omitted (``None``) or
            empty, the column sums of ``matrix`` are used instead.

    Returns:
        An array/matrix with the shape of ``matrix``. The result is not
        rounded. Divide/invalid floating-point warnings are silenced.
    """
    # Decide on "was a length vector supplied", not on its truthiness:
    # ``.all()`` is vacuously True for an empty array (so the fallback was
    # never reached) and False for any vector containing a 0 (so a supplied
    # vector was silently discarded).
    if length is None or numpy.size(length) == 0:
        d = numpy.sum(matrix, axis=0)
    else:
        d = length

    with numpy.errstate(divide='ignore', invalid='ignore'):
        length_norm = 1 - b + b * (d / avgdl)
        tf_part = (matrix * (1 + k1)) / (matrix + k1 * length_norm)
        idf_part = numpy.log10(N / collection)
        # numpy.multiply keeps this element-wise even for numpy.matrix inputs,
        # where ``*`` would be a matrix product.
        return numpy.multiply(tf_part, idf_part)
    
def query(queries,matrix,isSum=False):
    """Combine the rows of ``matrix`` selected by ``queries`` into one row.

    ``queries`` is an iterable of row indices; an index that occurs several
    times is applied several times. With ``isSum`` false the selected rows are
    multiplied element-wise, starting from a row of ones; with ``isSum`` true
    they are added, starting from a row of zeros. The result has shape
    ``(1, matrix.shape[1])`` and is rounded to three decimals.
    """
    width = matrix.shape[1]
    if isSum :
        combine = numpy.add
        accumulator = numpy.zeros((1, width))
    else :
        combine = numpy.multiply
        accumulator = numpy.ones((1, width))

    for row_index in queries:
        # Explicit ufunc with out= keeps the update in place and element-wise,
        # also when matrix is a numpy.matrix.
        combine(accumulator, matrix[row_index,:], out=accumulator)

    return numpy.round(accumulator,3)
    

# Usage

# Exercises for Lecture 2


<img src="images/lm_model_exercise.png">

In [309]:
#TODO
# Count matrix for the lecture exercise: one row per label in ROW.
# NOTE(review): the five columns presumably stand for documents — confirm
# against the exercise image.
ROW = ["T1","T2","T3","T4","T5","T6"]
mtx = numpy.matrix([
    [ 0.,  1.,  0.,  0.,  1.],
    [ 0.,  1.,  0.,  0.,  1.],
    [ 3.,  2.,  2.,  0.,  1.],
    [ 0.,  0.,  1.,  1.,  0.],
    [ 0.,  0.,  1.,  1.,  1.],
    [ 2.,  1.,  0.,  2.,  0.]
])

In [310]:
# Show the raw count matrix, then the total of each column.
print(mtx)
print(sumColumn(mtx))

[[ 0.  1.  0.  0.  1.]
 [ 0.  1.  0.  0.  1.]
 [ 3.  2.  2.  0.  1.]
 [ 0.  0.  1.  1.  0.]
 [ 0.  0.  1.  1.  1.]
 [ 2.  1.  0.  2.  0.]]
[[ 5.  5.  4.  4.  4.]]


In [311]:
# Column-normalised counts (rounded to 3 decimals by the helper).
print("empirical language models")
emtx = emprical_language_model(mtx)
print(emtx)
# Sanity check: the column totals of the normalised matrix are printed.
print(sumColumn(emtx))

empirical language models
[[ 0.    0.2   0.    0.    0.25]
 [ 0.    0.2   0.    0.    0.25]
 [ 0.6   0.4   0.5   0.    0.25]
 [ 0.    0.    0.25  0.25  0.  ]
 [ 0.    0.    0.25  0.25  0.25]
 [ 0.4   0.2   0.    0.5   0.  ]]
[ 1.  1.  1.  1.  1.]


In [312]:
# Each row's share of all counts in mtx (rounded to 3 decimals by the helper).
print("collecction language models")
cmtx = collection_language_model(mtx)
print(cmtx)
# Sanity check: the total of the single column is printed.
print(sumColumn(cmtx))

collecction language models
[[ 0.091]
 [ 0.091]
 [ 0.364]
 [ 0.091]
 [ 0.136]
 [ 0.227]]
[ 1.]


In [313]:
print("Jelinek-Mercer smoothing models")
# param is the weight given to the collection model in the interpolation.
param = 0.1
jmsmtx = Jelinek_Mercer_smoothing(mtx,param)
print(jmsmtx)

Jelinek-Mercer smoothing models
[[ 0.009  0.189  0.009  0.009  0.234]
 [ 0.009  0.189  0.009  0.009  0.234]
 [ 0.576  0.396  0.486  0.036  0.261]
 [ 0.009  0.009  0.234  0.234  0.009]
 [ 0.014  0.014  0.239  0.239  0.239]
 [ 0.383  0.203  0.023  0.473  0.023]]


In [314]:
# Score every test query against the Jelinek-Mercer smoothed model. query()
# is called without isSum, so the selected rows are multiplied together and a
# label that is repeated in the query counts once per occurrence.
for querys in (["T3"], ["T2","T1"], ["T6"], ["T3","T1","T3","T2"]):
    print("\nQuery test " + " ".join(querys))
    query_to_idx = [ROW.index(q) for q in querys]
    print(query(query_to_idx,jmsmtx))


Query test T3
[[ 0.576  0.396  0.486  0.036  0.261]]

Query test T2 T1
[[ 0.     0.036  0.     0.     0.055]]

Query test T6
[[ 0.383  0.203  0.023  0.473  0.023]]

Query test T3 T1 T3 T2
[[ 0.     0.006  0.     0.     0.004]]


# Trial Exam Case


<img src="images/lm_model_exam.png">

In [315]:
#TODO
# Count matrix for the trial exam case: one row per label in ROW and one
# column per label in COL.
COL = ["D1","D2","D3","D4"]
ROW = ["T1","T2","T3","T4","T5"]
mtx = numpy.matrix([
    [ 1.,  1.,  2.,  1.],
    [ 0.,  2.,  0.,  1.],
    [ 2.,  0.,  1.,  0.],
    [ 4.,  0.,  1.,  2.],
    [ 1.,  2.,  1.,  0.],
])
# Raw counts and the total of each column.
print(mtx)
print(sumColumn(mtx))
# Column-normalised counts, followed by their column totals as a sanity check.
print("\nempirical language models")
emtx = emprical_language_model(mtx)
print(emtx)
print(sumColumn(emtx))
# Each row's share of all counts, followed by the total of those shares.
print("\ncollecction language models")
cmtx = collection_language_model(mtx)
print(cmtx)
print(sumColumn(cmtx))
print("\nDirichlet smoothing models")
# param is the pseudo-count mass added to every column by Dirichlet_smoothing.
param = 6
dsmtx = Dirichlet_smoothing(mtx,param)
print(dsmtx)
# query() is called without isSum, so the rows for T5 and T2 are multiplied.
print("\nQuery test T5 T2")
querys = ["T5","T2"]
query_to_idx = [ROW.index(q) for q in querys]
print(query(query_to_idx,dsmtx))

[[ 1.  1.  2.  1.]
 [ 0.  2.  0.  1.]
 [ 2.  0.  1.  0.]
 [ 4.  0.  1.  2.]
 [ 1.  2.  1.  0.]]
[[ 8.  5.  5.  4.]]

empirical language models
[[ 0.125  0.2    0.4    0.25 ]
 [ 0.     0.4    0.     0.25 ]
 [ 0.25   0.     0.2    0.   ]
 [ 0.5    0.     0.2    0.5  ]
 [ 0.125  0.4    0.2    0.   ]]
[ 1.  1.  1.  1.]

collecction language models
[[ 0.227]
 [ 0.136]
 [ 0.136]
 [ 0.318]
 [ 0.182]]
[ 0.999]

Dirichlet smoothing models
[[ 0.169  0.215  0.306  0.236]
 [ 0.058  0.256  0.074  0.182]
 [ 0.201  0.074  0.165  0.082]
 [ 0.422  0.173  0.264  0.391]
 [ 0.149  0.281  0.19   0.109]]

Query test T5 T2
[[ 0.009  0.072  0.014  0.02 ]]


# Last year's trial exam case

<img src="images/bm25.png"/>

In [316]:
# There is a lot of data to organise in this case.
# Be careful when laying it out: mtx has one row per label in ROW and one
# column per label in COL.
COL = ["D1","D2"]
ROW = ["T1","T2","T3","T4","T5","T6"]
mtx = numpy.matrix([
    [ 3.,  4.],
    [ 0.,  3.],
    [ 5.,  5.],
    [ 2.,  2.],
    [ 10.,  1.],
    [ 5.,  5.],
])
# One value per row of mtx; BM25 uses it as the divisor in log10(N / collection).
collection = numpy.matrix([
    [100],
    [50],
    [80],
    [93],
    [100],
    [25],
])
# One length per column of mtx, passed to BM25 explicitly. These values equal
# the column sums of mtx (25 and 20).
length = numpy.matrix([
    [25,20],
])

# BM25 inputs: N is the numerator of the log term, k1 and b are the tuning
# constants, and avgdl is the value the lengths are divided by.
N=1000
k1=1.2
b=0.75
avgdl = 50

bm25mtx = BM25(mtx, k1, b, avgdl,collection, N, length=length)
print("BM25 scores")
print(bm25mtx)

# A single-label query just selects that label's row of scores.
querys = ["T2"]
query_to_idx = [ROW.index(q) for q in querys]
print("Query t2 score")
print(query(query_to_idx,bm25mtx))


print("Query t2,t2,t5, score")
querys = ["T2","T2","T5"]
query_to_idx = [ROW.index(q) for q in querys]

# isSum=True matters here: BM25 scores are not probabilities, so the rows
# selected by the query are added instead of multiplied.
print(query(query_to_idx,bm25mtx,isSum=True))

BM25 scores
[[ 1.76        1.88841202]
 [ 0.          2.34611966]
 [ 2.09843655  2.13180391]
 [ 1.65042728  1.70626881]
 [ 2.04651163  1.3253012 ]
 [ 3.06481042  3.11354415]]
Query t2 score
[[ 0.     2.346]]
Query t2,t2,t5, score
[[ 2.047  6.018]]
