In [1]:
import numpy as np
import pandas as pd
import jieba
import gensim
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from gensim.models import KeyedVectors

## Doc and Query Preprocess Functions

In [2]:
def remove_dash(x):
    return x.replace('-', '')

def remove_slash(x):
    return x.replace('/', ' ')

def remove_pound(x):
    return x.replace('#', ',')

def to_lower(x):
    return x.lower()

def slice(x):
    return x[1:]

def doc_preprocess(dir):
    # load description doc
    df = pd.read_csv(dir)

    # remove dash and slash in doc, and convert all upper to lower
    for col in df:
        df[col] = df[col].astype(str)
        df[col] = df[col].apply(remove_dash)
        df[col] = df[col].apply(remove_slash)
        df[col] = df[col].apply(to_lower)

    # remove the pound in keyword
    df['Keyword'] = df['Keyword'].apply(remove_pound)
    df['Keyword'] = df['Keyword'].apply(slice)

    # return a well-process dataframe
    return df

def query_preprocess(query):
    # remove some specific punctuation from query
    query = remove_dash(query)
    query = remove_slash(query)
    query = remove_pound(query)
    query = to_lower(query)

    # remove stopwords from query
    with open('baidu_stopwords.txt') as file:
        stopwords = set(file.read().splitlines())
    seg_query = jieba.cut_for_search(query)
    final_query = []
    for seg in seg_query:
        if seg not in stopwords:
            final_query.append(seg)

    #return ','.join(final_query)
    return final_query

## Preprocess Description Doc

In [3]:
with open('baidu_stopwords.txt') as file:
    stopword = file.read().splitlines()
docs = doc_preprocess('description_doc.csv')
doc_list = []
for idx, val in docs.iterrows():
    doc_list.append(' '.join(val))
seg_doc = [jieba.cut_for_search(doc) for doc in doc_list]
processed_doc = []
tmp = []
for seg in seg_doc:
    for word in seg:
        if word not in stopword:
            tmp.append(word)
    processed_doc.append(tmp)
    tmp = []
print(processed_doc)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/1h/wb9s9yjd765clfxzbgytj2y80000gn/T/jieba.cache
Loading model cost 1.012 seconds.
Prefix dict has been built succesfully.


[['h1b', '期间', '多位', '雇主', '工作', '工作', '雇主', '额外', 'lisa', 'smith', 'h1b', '雇主', '多位', '雇主', 'part', 'time', 'concurrent', 'h1b', 'file', '全新', 'h1b', 'petition', '做', 'concurrent', 'h1b', '申请', '新', 'h1b', '职位', '学历', '相关', '；', '雇主', '薪水', '最低', '标准', '最低标准', 'part', 'time', '职位', '薪水', 'hourly', 'wage', '最低', '标准', '最低标准', '雇主', '工作', '职位', '全职', '兼职'], ['h1b', '期间', 'layoff', 'lisa', 'smith', 'h1b', '离职', '解雇', 'lay', 'grace', 'period', 'h1b', '期间', 'layoff', '建议', '第一', '找', '工作', '美', '合法', '身份', '\n', '雇主', 'revoke', 'h1b', '建议', '身份', '转换', '转换', '换成', '转换成', 'b1', 'b2', 'f1', '；', '\n', 'revoke', '一张', 'paycheck', '日期', '60', '天', '之内', '找', '新', '雇主', '帮', '提交', 'h1b', 'petition', 'request', '移民', '移民局', '收到', '新', '雇主', '工作', '；', '\n', '雇主', 'revoke', '60', '天内', '没', '找到', '工作', '60', '天', '结束', '申请', '转换', '身份', '转换', 'b1', 'b2', 'f1'], ['h1b', '换', '工作', 'transfer', '做', 'lisa', 'smith', 'h1b', '换', '工作', 'h1b', 'transfer', '跳槽', '申请', 'h1b', 'transfer', 'original', 'pet

## Load Pretrain Word2Vec Model

In [8]:
pretrain_model = gensim.models.Word2Vec.load('word2vec_wiki.model')
print(pretrain_model.wv.vectors.shape)

(1994343, 500)


## Incremental Training to Update the Pretrain Word2Vec Model

In [9]:
pretrain_model.build_vocab(processed_doc, update=True)
pretrain_model.train(processed_doc, total_examples=len(processed_doc), epochs=20)
print(pretrain_model.wv.vectors.shape)

(1994450, 500)


## Save the Retrained Model in Key-Vector Format

In [20]:
word_vec = pretrain_model.wv
path = os.path.dirname(os.path.abspath(os.getcwd()))
word_vec.save(path+'/OneMinLaw_Word2Vec/word_vectors')

In [4]:
path = os.path.dirname(os.path.abspath(os.getcwd()))
word_vec = KeyedVectors.load(path+'/OneMinLaw_Word2Vec/word_vectors', mmap='r')
print(word_vec.vectors.shape)

(1994450, 500)


## Doc Vectorizer without TF-IDF Weight

In [5]:
def doc_vectorizer(doc, model):
    doc_vec = np.zeros(500)
    num_words = 0
    for w in doc:
        try:
            doc_vec = np.add(doc_vec, model[w])
            num_words += 1
        except:
            pass
    return doc_vec / num_words
 
vectors = []
for doc in processed_doc:
    vectors.append(doc_vectorizer(doc, word_vec))

print(vectors)

[array([ 4.42609023e-01, -4.00706381e-01,  2.84740240e-02,  5.21900599e-01,
       -1.97023127e-01,  1.75389003e-01, -5.06214551e-01,  5.55975129e-02,
        9.37595242e-02,  1.23291800e-01,  3.60451508e-01,  1.94247176e-01,
        1.04802269e-01, -1.11803418e-01,  3.97885580e-01, -1.10616443e-02,
        6.22566184e-01,  2.70519842e-01, -2.44556479e-01,  8.26807046e-02,
       -8.37828391e-01,  6.40737339e-02, -5.19256746e-02, -7.59410484e-01,
        1.38738351e-01,  6.16186646e-01, -4.08942155e-01,  5.64172851e-02,
       -2.84915531e-01, -2.44063220e-01,  2.38278315e-01, -2.69316764e-01,
       -3.88587527e-01,  9.31355004e-02, -1.58317782e-02,  8.54378389e-01,
        5.74019063e-01, -1.92125563e-01,  2.86270594e-02,  1.68122235e-01,
        7.42695585e-01, -3.73793150e-01,  5.81516470e-01,  1.11194526e-01,
        5.52623935e-01, -4.48964868e-01, -2.32936964e-01, -5.95441205e-01,
       -2.25218287e-01,  4.03127204e-02,  2.70418628e-01, -1.08631960e-02,
        5.53542543e-01, 

In [38]:
# save the doc vec
np.save('doc_vec.npy', vectors)
doc_vec = list(np.load('doc_vec.npy'))

[array([ 4.42609023e-01, -4.00706381e-01,  2.84740240e-02,  5.21900599e-01,
       -1.97023127e-01,  1.75389003e-01, -5.06214551e-01,  5.55975129e-02,
        9.37595242e-02,  1.23291800e-01,  3.60451508e-01,  1.94247176e-01,
        1.04802269e-01, -1.11803418e-01,  3.97885580e-01, -1.10616443e-02,
        6.22566184e-01,  2.70519842e-01, -2.44556479e-01,  8.26807046e-02,
       -8.37828391e-01,  6.40737339e-02, -5.19256746e-02, -7.59410484e-01,
        1.38738351e-01,  6.16186646e-01, -4.08942155e-01,  5.64172851e-02,
       -2.84915531e-01, -2.44063220e-01,  2.38278315e-01, -2.69316764e-01,
       -3.88587527e-01,  9.31355004e-02, -1.58317782e-02,  8.54378389e-01,
        5.74019063e-01, -1.92125563e-01,  2.86270594e-02,  1.68122235e-01,
        7.42695585e-01, -3.73793150e-01,  5.81516470e-01,  1.11194526e-01,
        5.52623935e-01, -4.48964868e-01, -2.32936964e-01, -5.95441205e-01,
       -2.25218287e-01,  4.03127204e-02,  2.70418628e-01, -1.08631960e-02,
        5.53542543e-01, 

## Search Test on the Model without TF-IDF Weight

In [6]:
query = input()
query = query_preprocess(query)
# convert query to vector
query_vector = doc_vectorizer(query, word_vec)
res = []
for i in range(len(vectors)):
    res.append([i, 1 - spatial.distance.cosine(query_vector, vectors[i])])
res = sorted(res, key=lambda x: x[1], reverse=True)
for i in range(len(res)):
    print(docs.loc[res[i][0], ['Title']].to_string())

H1B回国
Title    h1b pending期间是否可以回国？
Title    opt期间可否回国
Title    j1身份一定要回国服役两年吗？
Title    h1b transfer被deny后还有没有grace period
Title    h1b期间离职停薪的风险
Title    还没拿到硕士学位怎么办？h1b可以按照master cap抽签吗？
Title    visa 过期了或者opt grace period即将到期，因疫情无法回国，但也没办法做正...
Title    h1b期间换工作需要做些什么？
Title    opt期间可否同时为两位雇主工作？
Title    拿到h1b才能办绿卡吗？
Title    关于opt extension的申请
Title    拿h1b之后可以在美国待多久？
Title    没有h1b可以申请工作绿卡吗
Title    eb2 eb3 perm的流程以及时间安排
Title    如果h1b批准了但是还没有转换身份可以办绿卡吗？
Title    拿到140 approval notice有什么好处
Title    h1b期间可以为两位雇主工作吗？
Title    没有毕业可以抽h1b吗？
Title    一个人一辈子只能有一次opt吗？
Title    2020年h1b怎么准备资料？注册前准备还是抽签后？
Title    h1b抽中之后我需要做什么？
Title    所有美国硕士学位都可以按master抽签吗？
Title    h1b期间被layoff怎么办？
Title    可不可以有多个雇主为同时提交h1b transfer的申请？
Title    现在面签被取消无法进行境外转换身份该怎么办？
Title    拥有l1 visa多久后可以申请eb1c？
Title    b1 b2签证快到期但因疫情航班被取消，已经延期过一次还可以再延期吗？
Title    h1b换工作transfer需要做什么呢？
Title    我去年的h1b被deny了，目前正在appeal。我今年还能参加抽签吗？如果抽中的话appe...
Title    我去年的h1b还在pending，还可以参加今年的抽签吗？会不会有duplicate pet...
Title    h1

## Construct the TF-IDF Vectors Dictionary for Docs

In [7]:
new_processed_doc = [' '.join(processed_doc[i]) for i in range(len(processed_doc))]
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 3)).fit(new_processed_doc)
tfidf_vectors = tfidf.transform(new_processed_doc)

In [8]:
feature_names = tfidf.get_feature_names()
tfidf_dic = {}
idx = 0
for text_tfidf in tfidf_vectors.toarray():
    tfidf_dic[idx] = {}
    dic = {}
    for i in range(len(text_tfidf)):
        dic[feature_names[i]] = text_tfidf[i]
    tfidf_dic[idx] = dic
    idx += 1
print(tfidf_dic[0])

{'10': 0.0, '10 h1b': 0.0, '10 h1b 正式': 0.0, '10 时间': 0.0, '10 时间 process': 0.0, '10 移民': 0.0, '10 移民 移民局': 0.0, '12': 0.0, '12 有用': 0.0, '12 有用 没有用完': 0.0, '13': 0.0, '13 20': 0.0, '13 20 之间': 0.0, '14': 0.0, '140': 0.0, '140 approval': 0.0, '140 approval notice': 0.0, '140 approval 无限': 0.0, '140 priority': 0.0, '140 priority date': 0.0, '140 renew': 0.0, '140 renew h1b': 0.0, '15': 0.0, '15 会出': 0.0, '15 会出 加急': 0.0, '20': 0.0, '20 40': 0.0, '20 40 小时': 0.0, '20 之间': 0.0, '20 之间 h1b': 0.0, '20 小时': 0.0, '20 小时 移民': 0.0, '20 小时 雇主': 0.0, '20 日后': 0.0, '20 日后 加急': 0.0, '20 移民': 0.0, '20 移民 移民局': 0.0, '2020': 0.0, '2020 h1b': 0.0, '2020 h1b 注册': 0.0, '2020 h1b 资料': 0.0, '2020 学位': 0.0, '2020 学位 证书': 0.0, '2020 抽签': 0.0, '2020 抽签 制度': 0.0, '2year': 0.0, '2year home': 0.0, '2year home country': 0.0, '30': 0.0, '30 抽中': 0.0, '30 抽中 h1b': 0.0, '30 来说': 0.0, '30 来说 一般来说': 0.0, '30 雇主': 0.0, '30 雇主 递交': 0.0, '31': 0.0, '31 线上': 0.0, '31 线上 公布': 0.0, '40': 0.0, '40 小时': 0.0, '40 小时 工作': 0.0, 

In [46]:
# save tfidf vectorizer
import pickle
pickle.dump(tfidf, open("tfidf_vectorizer.pickle", "wb"))
tfidf_vectorizer = pickle.load(open("tfidf_vectorizer.pickle", "rb"))

False


In [31]:
# save the TF-IDF dic in local
np.save('tf_idf.npy', tfidf_dic) 

## Doc Vectorizer with TF-IDF Weight

In [9]:
def weighted_doc_vectorizer(doc, model, idx):
    doc_vec = np.zeros(500)
    num_words = 0
    for w in doc:
        try:
            weighted_vector = model[w] * tfidf_dic[idx][w]
            doc_vec = np.add(doc_vec, weighted_vector)
            num_words += 1
        except:
            pass
    return doc_vec / num_words
 
weighted_vectors = []
idx = 0
for doc in processed_doc:
    weighted_vectors.append(weighted_doc_vectorizer(doc, word_vec, idx))
    idx += 1
print(weighted_vectors)

[array([ 4.98573753e-02, -6.05849075e-02,  1.62334374e-03,  6.61933813e-02,
       -2.48551122e-02,  3.39337636e-02, -9.41701855e-02, -4.89323643e-04,
        1.47168692e-02,  1.09970438e-02,  7.17864722e-02,  1.88355621e-02,
        2.59617176e-02, -1.03812887e-02,  4.63413813e-02,  1.12518317e-02,
        9.19790836e-02,  4.30400020e-02, -2.96016012e-02, -3.91178344e-03,
       -1.31015172e-01,  2.60826725e-02, -2.62187201e-02, -1.14050672e-01,
        3.62345964e-02,  9.69902847e-02, -5.45122729e-02, -5.15551121e-03,
       -5.00245714e-02, -3.66729873e-02,  3.27628398e-02, -3.71227411e-02,
       -5.06268612e-02,  1.29078614e-02,  1.03119196e-02,  1.40900188e-01,
        8.26877119e-02, -2.86893935e-02, -3.59209275e-03,  3.35226397e-02,
        1.08664325e-01, -6.38611147e-02,  9.13121341e-02,  1.10978367e-02,
        7.51845737e-02, -7.15617544e-02, -4.61287790e-02, -8.65940578e-02,
       -3.92279224e-02,  1.33587488e-03,  3.60298446e-02,  3.24333680e-03,
        8.38538492e-02, 

In [42]:
# save weighted doc vectors
np.save('weighted_doc_vec.npy', weighted_vectors)

## Query Vectorizer with TF-IDF Weight

In [10]:
def query_vectorizer(doc, model, query_tfidf_dic):
    query_vec = np.zeros(500)
    num_words = 0
    for w in doc:
        try:
            weighted_vector = model[w] * query_tfidf_dic[w]
            query_vec = np.add(query_vec, model[w])
            num_words += 1
        except:
            pass
    return query_vec / num_words

## Search Test on Model with TF-IDF Weight

In [11]:
weighted_query = input()
weighted_query = query_preprocess(weighted_query)

# construct tfidf for the query
query2 = [' '.join(weighted_query)]
query_tfidf_vector = tfidf.transform(query2)
query_tfidf_val = query_tfidf_vector.toarray()[0]
query_tfidf_dic = {}
idx = 0
for feature in tfidf.get_feature_names():
    query_tfidf_dic[feature] = query_tfidf_val[idx]
    idx += 1

# get the weighted vector of query
weighted_query_vector = query_vectorizer(weighted_query, word_vec, query_tfidf_dic)

res = []
for i in range(len(weighted_vectors)):
    res.append([i, 1 - spatial.distance.cosine(weighted_query_vector, weighted_vectors[i])])
res = sorted(res, key=lambda x: x[1], reverse=True)
for i in range(len(res)):
    print(docs.loc[res[i][0], ['Title']].to_string())

H1B回国
Title    opt期间可否回国
Title    h1b pending期间是否可以回国？
Title    j1身份一定要回国服役两年吗？
Title    h1b transfer被deny后还有没有grace period
Title    h1b期间换工作需要做些什么？
Title    h1b期间离职停薪的风险
Title    opt期间可否同时为两位雇主工作？
Title    拿h1b之后可以在美国待多久？
Title    没有毕业可以抽h1b吗？
Title    visa 过期了或者opt grace period即将到期，因疫情无法回国，但也没办法做正...
Title    关于opt extension的申请
Title    eb2 eb3 perm的流程以及时间安排
Title    一个人一辈子只能有一次opt吗？
Title    没有h1b可以申请工作绿卡吗
Title    还没拿到硕士学位怎么办？h1b可以按照master cap抽签吗？
Title    h1b抽中之后我需要做什么？
Title    2020年h1b怎么准备资料？注册前准备还是抽签后？
Title    h1b期间可以为两位雇主工作吗？
Title    所有美国硕士学位都可以按master抽签吗？
Title    h1b换工作transfer需要做什么呢？
Title    现在面签被取消无法进行境外转换身份该怎么办？
Title    拿到h1b才能办绿卡吗？
Title    拥有l1 visa多久后可以申请eb1c？
Title    何时可以正式开始绿卡排期？
Title    我去年的h1b被deny了，目前正在appeal。我今年还能参加抽签吗？如果抽中的话appe...
Title    可不可以有多个雇主为同时提交h1b transfer的申请？
Title    申请stem opt需要满足什么条件
Title    h1b 申请被deny还有这些方法可以逆转局势？
Title    我去年的h1b还在pending，还可以参加今年的抽签吗？会不会有duplicate pet...
Title    h1b期间被layoff怎么办？
Title    可以同时申请eb1a、eb2、eb3吗
Title    b1 

## Functions to Calculate NDCG

In [12]:
relevance = pd.read_csv('relevance.csv')
relevance.head(5)

Unnamed: 0.1,Unnamed: 0,H-1B期间可以为多位雇主工作吗？对工作和雇主有额外要求吗？,H-1B期间被Layoff怎么办？,H-1B换工作transfer需要做什么呢？,现在面签被取消无法进行境外转换身份该怎么办？,Visa 过期了或者opt grace period即将到期，因疫情无法回国，但也没办法做正常身份延期怎么办？\n,B1/B2签证快到期但因疫情航班被取消，已经延期过一次还可以再延期吗？,2020年 H-1B注册重要的时间截点,还没拿到硕士学位怎么办？H-1B可以按照master cap抽签吗？,所有美国硕士学位都可以按master抽签吗？
0,H-1B期间可以为多位雇主工作吗？对工作和雇主有额外要求吗？,5,1,4,1,0,0,0,1,1
1,H-1B期间被Layoff怎么办？,1,5,4,4,2,0,0,1,1
2,H-1B换工作transfer需要做什么呢？,4,4,5,3,0,0,0,1,1
3,现在面签被取消无法进行境外转换身份该怎么办？,0,2,3,5,0,0,2,1,1
4,Visa 过期了或者opt grace period即将到期，因疫情无法回国，但也没办法做正...,0,0,0,0,5,0,0,0,0


In [13]:
# get the list of test query
test_query = relevance.columns.values[1:]
print(test_query)

['H-1B期间可以为多位雇主工作吗？对工作和雇主有额外要求吗？' 'H-1B期间被Layoff怎么办？'
 'H-1B换工作transfer需要做什么呢？' '现在面签被取消无法进行境外转换身份该怎么办？'
 'Visa 过期了或者opt grace period即将到期，因疫情无法回国，但也没办法做正常身份延期怎么办？\n'
 'B1/B2签证快到期但因疫情航班被取消，已经延期过一次还可以再延期吗？' '2020年 H-1B注册重要的时间截点'
 '还没拿到硕士学位怎么办？H-1B可以按照master cap抽签吗？' '所有美国硕士学位都可以按master抽签吗？']


In [14]:
# get the true relevance score of each query to each doc
true_score = []
for col in relevance:
    true_score.append(relevance[col].tolist())
true_score = true_score[1:]
print(true_score)

[[5, 1, 4, 0, 0, 0, 0, 0, 0, 4, 1, 2, 5, 5, 5, 4, 0, 0, 0, 4, 1, 5, 0, 4, 4, 0, 4, 0, 0, 5, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 1], [1, 5, 4, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 3, 1, 0, 0, 3, 4, 3, 3, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0], [4, 4, 5, 3, 0, 0, 1, 1, 1, 1, 1, 2, 5, 5, 5, 5, 1, 1, 0, 0, 4, 5, 0, 5, 5, 0, 5, 0, 0, 4, 0, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 5], [1, 4, 3, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 5, 3, 3, 3, 4, 1, 1, 1, 1, 1, 2, 2, 2, 4, 2, 4, 5, 3, 1, 2, 2, 0, 0, 4, 0, 4, 0, 4, 0, 0, 5, 0, 0, 0, 0, 0, 3, 2, 3], [1, 1, 1, 1, 0, 0, 3, 5, 5, 4, 4, 3, 2, 

In [15]:
import math
def calculate_dcg(items):
    dcg = 0
    i = 0
    for item in items:
        i += 1
        dcg += item / math.log(i + 1, 2)
    return dcg

## Calculate the NDCG for Model without Weight

In [16]:
query_list = []
for query in test_query:
    query_list.append(query_preprocess(query))
print(query_list)

# convert query_list to vector_list
query_vector_list = []
for query in query_list:
    query_vector_list.append(doc_vectorizer(query, word_vec))
print(query_vector_list)

# get the search score of each query
res = []
for i in range(len(query_vector_list)):
    tmp = []
    for j in range(len(vectors)):
        tmp.append([j, 1 - spatial.distance.cosine(query_vector_list[i], vectors[j])])
    res.append(tmp)
print(res)

[['h1b', '期间', '多位', '雇主', '工作', '工作', '雇主', '额外'], ['h1b', '期间', 'layoff'], ['h1b', '换', '工作', 'transfer', '做'], ['面签', '取消', '境外', '转换', '身份'], ['visa', '过期', 'opt', 'grace', 'period', '即将', '到期', '疫情', '回国', '没', '办法', '做', '身份', '延期', '\n'], ['b1', 'b2', '签证', '快', '到期', '疫情', '航班', '取消', '延期', '还', '再', '延期'], ['2020', '年', 'h1b', '注册', '时间', '截点'], ['还', '拿到', '没拿到', '硕士', '学位', '硕士学位', 'h1b', 'master', 'cap', '抽签'], ['美国', '硕士', '学位', '硕士学位', '都', 'master', '抽签']]
[array([ 0.70463433, -0.4931145 , -0.19750405,  0.99642293, -0.0863217 ,
        0.68963629, -0.6801001 ,  0.32185923,  0.04664126,  0.46925446,
        0.63607522, -0.17531798, -0.0635747 , -0.43654217,  0.49302753,
        0.1792613 ,  1.08362304,  0.08969322, -1.0085771 , -0.02123862,
       -1.66581315,  0.56562347, -0.33834672, -1.11758514,  0.51556768,
        0.42339193, -0.81851207,  0.52361872, -0.43048535,  0.0776918 ,
        0.3136959 , -0.59459402, -1.15682943,  0.24010787, -0.2685976 ,
        1.10241271,

In [17]:
# sort the result of search of each query
for i in range(len(res)):
    res[i] = sorted(res[i], key=lambda x: x[1], reverse=True)

# assign the score to the res list
rel_score = []
for i in range(len(res)):
    tmp = []
    for j in range(len(res[i])):
        tmp.append(true_score[i][res[i][j][0]])
    rel_score.append(tmp)
print(rel_score)

[[5, 0, 5, 4, 1, 0, 0, 2, 0, 4, 0, 5, 5, 0, 4, 0, 4, 1, 0, 0, 0, 4, 5, 1, 1, 0, 0, 4, 4, 0, 0, 0, 0, 5, 4, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 5, 3, 0, 4, 5, 0, 0, 0, 3, 0, 0, 3, 0, 0, 1, 3, 0, 1, 2, 0, 0, 0, 1, 2, 2, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], [5, 5, 4, 0, 0, 0, 4, 3, 0, 5, 5, 0, 0, 0, 5, 5, 4, 0, 5, 0, 5, 0, 0, 1, 1, 1, 5, 4, 4, 1, 0, 0, 2, 0, 0, 0, 0, 3, 0, 1, 5, 1, 5, 1, 1], [0, 4, 0, 0, 0, 0, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [5, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 0, 4, 2, 0, 2, 0, 0, 3, 4, 3, 4, 1, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 1, 2, 1, 4, 5, 0, 1, 0, 5, 4, 0, 3, 3, 1, 2, 0, 3, 0, 4, 3, 1], [5, 5, 5, 0, 1, 1, 0, 4, 1, 3, 2, 0, 1, 

In [18]:
# calculate dcg, idcg and ndcg
dcg = []
idcg = []

for i in range(len(rel_score)):
    dcg.append(calculate_dcg(rel_score[i]))

perfect_score = []
for i in range(len(rel_score)):
    perfect_score.append(sorted(rel_score[i], reverse=True))

for i in range(len(perfect_score)):
    idcg.append(calculate_dcg(perfect_score[i]))

ndcg_list = []
for i in range(len(dcg)):
    ndcg_list.append(dcg[i]/idcg[i])

ndcg = sum(ndcg_list) / len(ndcg_list)

print('NDCG for each query:' + '\n' + str(ndcg_list) + '\n')
print ('Average NDCG: %f' % ndcg)

NDCG for each query:
[0.8107749303397268, 0.6109041610379081, 0.8530497322316617, 0.5657756472244498, 0.9220433016555237, 1.0, 0.7989636005809563, 0.8579625651457152, 0.8651277295929962]

Average NDCG: 0.809400


## Calculate NDCG for Model with Weight

In [19]:
query_list = []
for query in test_query:
    query_list.append(query_preprocess(query))

tfidf_query_list = []
for i in range(len(query_list)):
    query2 = [' '.join(query_list[i])]
    query_tfidf_vector = tfidf.transform(query2)
    query_tfidf_val = query_tfidf_vector.toarray()[0]
    query_tfidf_dic = {}
    idx = 0
    for feature in tfidf.get_feature_names():
        query_tfidf_dic[feature] = query_tfidf_val[idx]
        idx += 1
    tfidf_query_list.append(query_tfidf_dic)

weighted_query_vector = []
for i in range(len(query_list)):
    weighted_query_vector.append(query_vectorizer(query_list[i], word_vec, tfidf_query_list[i]))

res = []
for i in range(len(weighted_query_vector)):
    tmp = []
    for j in range(len(weighted_vectors)):
        tmp.append([j, 1 - spatial.distance.cosine(weighted_query_vector[i], weighted_vectors[j])])
    res.append(tmp)
print(res[0])

[[0, 0.8806482610141512], [1, 0.7623445520952161], [2, 0.7882673855577816], [3, 0.5878093031995743], [4, 0.506624959808969], [5, 0.5080593690019516], [6, 0.5237019612440816], [7, 0.48396227499825484], [8, 0.6170328316737758], [9, 0.5072901718243485], [10, 0.6372116718459692], [11, 0.46281635410396604], [12, 0.6156029165378835], [13, 0.6954326005171966], [14, 0.6681311036342197], [15, 0.5536174277555705], [16, 0.6936025972321613], [17, 0.6877735980140501], [18, 0.6195117744943349], [19, 0.6484928029777445], [20, 0.5812048700214818], [21, 0.8343143348579548], [22, 0.4643226253876369], [23, 0.6046983796740782], [24, 0.8904272813958697], [25, 0.6067174401187281], [26, 0.6565588573785829], [27, 0.8259478244765246], [28, 0.6846880334902696], [29, 0.6848609860767331], [30, 0.6909820054334131], [31, 0.724516465069602], [32, 0.6210428786903388], [33, 0.4798566991551464], [34, 0.7215316628969322], [35, 0.8627047525072844], [36, 0.5547487324899207], [37, 0.7712652504028809], [38, 0.61223692485929

In [20]:
# sort the result of search of each query
for i in range(len(res)):
    res[i] = sorted(res[i], key=lambda x: x[1], reverse=True)

# assign the score to the res list
rel_score = []
for i in range(len(res)):
    tmp = []
    for j in range(len(res[i])):
        tmp.append(true_score[i][res[i][j][0]])
    rel_score.append(tmp)
print(rel_score)

[[4, 5, 0, 5, 0, 4, 0, 1, 2, 4, 0, 1, 5, 0, 0, 0, 5, 0, 5, 0, 4, 4, 1, 0, 0, 0, 5, 0, 0, 4, 0, 0, 1, 0, 1, 4, 0, 0, 0, 4, 0, 0, 0, 0, 2], [0, 0, 0, 0, 3, 4, 0, 0, 0, 5, 0, 3, 0, 5, 0, 0, 2, 2, 4, 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 3, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0], [5, 5, 0, 5, 4, 0, 4, 4, 0, 0, 5, 5, 0, 5, 5, 1, 1, 0, 4, 1, 0, 0, 3, 0, 0, 0, 5, 5, 0, 5, 0, 1, 3, 0, 4, 5, 1, 0, 1, 0, 0, 0, 2, 1, 1], [4, 0, 0, 0, 0, 5, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 0, 0, 2, 4, 0, 3, 2, 0, 4, 2, 3, 2, 0, 0, 0, 4, 0, 1, 2, 0, 0, 5, 0, 4, 0, 0, 2, 1, 0, 2, 3, 1, 1, 4, 5, 1, 0, 3, 0, 2, 3, 1, 3, 4], [5, 5, 5, 0, 1, 4, 1, 0, 0, 0, 1, 2, 0, 

In [21]:
# calculate dcg, idcg and ndcg
dcg = []
idcg = []

for i in range(len(rel_score)):
    dcg.append(calculate_dcg(rel_score[i]))

perfect_score = []
for i in range(len(rel_score)):
    perfect_score.append(sorted(rel_score[i], reverse=True))

for i in range(len(perfect_score)):
    idcg.append(calculate_dcg(perfect_score[i]))

ndcg_list = []
for i in range(len(dcg)):
    ndcg_list.append(dcg[i]/idcg[i])

ndcg = sum(ndcg_list) / len(ndcg_list)

print('NDCG for each query:' + '\n' + str(ndcg_list) + '\n')
print ('Average NDCG: %f' % ndcg)

NDCG for each query:
[0.8193961333366236, 0.5646948752027778, 0.8738693009514846, 0.7023817133826565, 0.884797287855885, 1.0, 0.7798006022456796, 0.8572712343175007, 0.8485900039353353]

Average NDCG: 0.814533
