In [None]:
import numpy as np
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from tqdm import tqdm
import operator
import fasttext
import tensorflow as tf
import sentencepiece
import tensorflow_text
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import os
import re
import time
from nltk.tokenize import word_tokenize

In [None]:
# Map query id -> normalized query text, read from a tab-separated file.
queries = {}
with open('norm_queries.tsv', encoding='utf-8') as f:
    for line in f:
        fields = line.rstrip('\n').split('\t')
        # A row that starts with a tab yields an empty first field; drop it.
        if fields[0] == '':
            fields.pop(0)
        # Blank or truncated rows carry no (id, text) pair; skip them instead
        # of failing with IndexError.
        if len(fields) < 2:
            continue
        queries[fields[0]] = fields[1]

# Map document id -> normalized title, read from a tab-separated file.
titles = {}
with open("norm_titles.tsv", 'r', encoding='utf-8') as f:
    for line in f:
        # Strip only the line terminator. Slicing off the last character of the
        # title unconditionally would eat a real character when the final line
        # has no trailing newline or when the row has more than two columns.
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            # Blank or truncated row: nothing to store.
            continue
        titles[fields[0]] = fields[1]

В словарях сначала идут все query_id из трейна, затем из теста — в том порядке, в котором они встречаются в train.marks.tsv и sample.csv.

In [None]:
# Lookup tables filled by the loading cells below:
#   qid2docid   - query id    -> list of document ids
#   qid2query   - query id    -> query text
#   docid2title - document id -> title
qid2docid, qid2query, docid2title = {}, {}, {}

In [None]:
# Collect (query, document) pairs from the training marks, keeping only
# documents that have a known title.
train_queries_unq = []   # query ids in order of first appearance
_train_seen = set()      # the same ids, for O(1) membership tests
with open("train.marks.tsv", 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.rstrip('\n').split('\t')
        qid, docid = fields[0], fields[1]
        if docid not in titles:
            continue
        qid2docid.setdefault(qid, []).append(docid)
        docid2title[docid] = titles[docid]
        # A set lookup replaces scanning the growing list for every row.
        if qid not in _train_seen:
            qid2query[qid] = queries[qid]
            train_queries_unq.append(qid)
            _train_seen.add(qid)

In [None]:
# Collect (query, document) pairs from the submission sample, keeping only
# documents that have a known title.
test_queries_unq = []    # query ids in order of first appearance
_test_seen = set()       # the same ids, for O(1) membership tests
with open("sample.csv", 'r', encoding='utf-8') as f:
    next(f, None)  # skip the first line of the CSV (treated as a header)
    for line in f:
        fields = line.rstrip('\n').split(',')
        qid, docid = fields[0], fields[1]
        if docid not in titles:
            continue
        qid2docid.setdefault(qid, []).append(docid)
        docid2title[docid] = titles[docid]
        # A set lookup replaces scanning the growing list for every row.
        if qid not in _test_seen:
            qid2query[qid] = queries[qid]
            test_queries_unq.append(qid)
            _test_seen.add(qid)

In [None]:
# Every query id that has at least one titled document, in insertion order.
all_queries = list(qid2docid)

### large USE embeddings

Посчитаем косинусную близость между эмбеддингами large USE для заголовков и запросов

In [None]:
def cos(sp1, sp2):
    """Return the cosine similarity of two vectors as a NumPy float64 scalar.

    Both inputs are flattened first, so a row vector of shape (1, d) can be
    compared with a flat vector of shape (d,). Without the flattening such a
    pair yields a one-element array, whose string form is ``[0.53]`` rather
    than ``0.53``.

    Args:
        sp1: Array-like holding d numbers (any shape).
        sp2: Array-like holding d numbers (any shape).

    Returns:
        ``dot(sp1, sp2) / (|sp1| * |sp2|)`` as ``np.float64``. A zero-norm
        input divides by zero; no special handling is applied.
    """
    a = np.ravel(sp1)
    b = np.ravel(sp2)
    return np.float64(np.dot(a, b) / np.linalg.norm(a) / np.linalg.norm(b))

In [None]:
# Multilingual Universal Sentence Encoder (large, version 3) from TF Hub.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
embed = hub.load(USE_MODEL_URL)

In [None]:
def use_score(q, out_dir):
    """Score the documents of query ``q`` against the query with USE embeddings.

    Embeds the query text and the titles of the documents listed in
    ``qid2docid[q]`` with the module-level ``embed`` model and writes one
    ``query_id<TAB>doc_id<TAB>cosine`` line per document to
    ``<out_dir>/<q>.txt``.

    Args:
        q: Query id; must be a key of ``qid2docid`` and ``qid2query``.
        out_dir: Existing directory that receives the result file.

    Returns:
        None. The result is the file written to disk.
    """
    docs = []
    raw_titles = []
    for doc in qid2docid[q]:
        try:
            title = docid2title[doc]
        except KeyError:
            # Report the document without a title and leave it out of the
            # scores. Only the missing-key case is handled, so unrelated
            # errors are no longer silenced.
            print(q, doc)
            continue
        docs.append(doc)
        raw_titles.append(title)

    # The model is called with a one-element list; take its single row so that
    # cos() receives two flat vectors and the score is written as a plain
    # number, not as a one-element array such as "[0.53]".
    q_emb = embed([qid2query[q]]).numpy()[0]
    t_embs = embed(raw_titles).numpy()

    with open(out_dir + '/{}.txt'.format(q), 'w') as fout:
        for doc, t_emb in zip(docs, t_embs):
            fout.write(str(q) + '\t' + str(doc) + '\t' + str(cos(q_emb, t_emb)) + '\n')


In [None]:
out_dir = 'use'
# Create the output directory if it is missing. Unlike a bare ``except``,
# genuine failures (for example missing permissions) still surface.
os.makedirs(out_dir, exist_ok=True)

# Queries whose result file already exists are skipped, so an interrupted run
# can be resumed. The directory is listed once instead of once per query.
done = set(os.listdir(out_dir))
for q in tqdm(all_queries):
    if q + '.txt' not in done:
        use_score(q, out_dir)

### Doc2vec embeddings

Тоже на заголовках

In [None]:
def doc2vec_score(key, outdir):
    """Score the documents of query ``key`` with a per-query Doc2Vec model.

    A fresh PV-DBOW model (``dm=0``) is trained on the titles of the documents
    in ``qid2docid[key]`` plus the query text. The cosine similarity between
    the query vector and each document vector is written to
    ``<outdir>/<key>.txt`` as ``query_id<TAB>doc_id<TAB>cosine`` lines.

    Args:
        key: Query id; must be a key of ``qid2docid`` and ``qid2query``.
        outdir: Existing directory that receives the result file.

    Returns:
        ``key``, unchanged.
    """
    max_epochs = 100
    # Tag of the query document. A string is used on purpose: gensim treats
    # plain-int tags as direct indexes into its document-vector array, so a tag
    # like 9999999 would size that array for about ten million vectors.
    query_tag = '__query__'

    doc_ids = list(qid2docid[key])
    tagged_docs = [
        TaggedDocument(word_tokenize(docid2title[docid]), [docid])
        for docid in doc_ids
    ]
    # Tokenize the query the same way as the titles; a raw string would be
    # consumed character by character.
    tagged_docs.append(TaggedDocument(word_tokenize(qid2query[key]), [query_tag]))

    d2v = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0)
    d2v.build_vocab(tagged_docs)

    for _ in range(max_epochs):
        d2v.train(tagged_docs,
                  total_examples=d2v.corpus_count,
                  epochs=d2v.epochs)
        # Manual learning-rate schedule: lower alpha after each train() call
        # and pin min_alpha to it, so the next call runs at a constant rate.
        d2v.alpha -= 0.0002
        d2v.min_alpha = d2v.alpha

    query_vec = d2v.docvecs[query_tag]
    with open(outdir + '/{}.txt'.format(key), 'w') as fout:
        for docid in doc_ids:
            fout.write(key + '\t' + docid + '\t' + str(cos(query_vec, d2v.docvecs[docid])) + '\n')
    return key


In [None]:
out_dir = 'd2v'
# Create the output directory if it is missing. Unlike a bare ``except``,
# genuine failures (for example missing permissions) still surface.
os.makedirs(out_dir, exist_ok=True)

# Only the first three queries are processed here. Queries whose result file
# already exists are skipped; the directory is listed once, not per query.
done = set(os.listdir(out_dir))
for q in tqdm(all_queries[:3]):
    if q + '.txt' not in done:
        doc2vec_score(q, out_dir)

### FastText embeddings

Используем модель для русского языка

In [None]:
import fasttext.util

# Fetch the pretrained Russian fastText vectors (an existing local copy is
# left alone because of if_exists='ignore'), then load them.
FT_MODEL_FILE = 'cc.ru.300.bin'
fasttext.util.download_model('ru', if_exists='ignore')
ft_model = fasttext.load_model(FT_MODEL_FILE)
def fasttext_score(q, out_dir):
    """Score the documents of query ``q`` against the query with fastText.

    Embeds the query text and the titles of the documents in ``qid2docid[q]``
    with the module-level ``ft_model`` and writes one ``doc_id<TAB>cosine``
    line per document to ``<out_dir>/<q>.txt``. Unlike ``use_score`` and
    ``doc2vec_score``, the query id is not written to the file.

    Titles and queries are multi-word strings, so they are embedded with
    ``get_sentence_vector``. ``get_word_vector`` would treat the whole string,
    spaces included, as one out-of-vocabulary word.

    Args:
        q: Query id; must be a key of ``qid2docid`` and ``qid2query``.
        out_dir: Existing directory that receives the result file.

    Returns:
        None. The result is the file written to disk.
    """
    # Keyed by document id, so a document listed twice is scored once.
    title_vecs = {}
    for doc in qid2docid[q]:
        try:
            title = docid2title[doc]
        except KeyError:
            # Documents without a title are skipped. Only the missing-key case
            # is handled, so unrelated errors are no longer silenced.
            continue
        title_vecs[doc] = ft_model.get_sentence_vector(title)

    query_vec = ft_model.get_sentence_vector(qid2query[q])
    with open(out_dir + '/{}.txt'.format(q), 'w') as fout:
        for doc, title_vec in title_vecs.items():
            fout.write(str(doc) + '\t' + str(cos(title_vec, query_vec)) + '\n')
    return


In [None]:
out_dir = 'fasttext'
# Create the output directory if it is missing. Unlike a bare ``except``,
# genuine failures (for example missing permissions) still surface.
os.makedirs(out_dir, exist_ok=True)

# Only the first three queries are processed here. Queries whose result file
# already exists are skipped; the directory is listed once, not per query.
done = set(os.listdir(out_dir))
for q in tqdm(all_queries[:3]):
    if q + '.txt' not in done:
        fasttext_score(q, out_dir)

### Схожесть d2v эмбеддингов запросов

Будем применять для сглаживания

In [None]:
# One TaggedDocument per query, tagged with its query id. The text is split
# into word tokens first: TaggedDocument expects a list of tokens, and a raw
# string would be consumed character by character.
queries_tagged = [
    TaggedDocument(word_tokenize(text), [qid])
    for qid, text in qid2query.items()
]

In [None]:
max_epochs = 100
# NOTE(review): this value is never passed to the model below, so training
# starts from the library's default alpha. Confirm whether it should be wired in.
alpha = 0.0025
d2v = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0)
d2v.build_vocab(queries_tagged)
for epoch in range(max_epochs):
    # ``epochs`` is used instead of the older ``iter`` attribute, which
    # current gensim releases no longer provide.
    d2v.train(queries_tagged,
              total_examples=d2v.corpus_count,
              epochs=d2v.epochs)
    # Manual learning-rate schedule: lower alpha after each train() call and
    # pin min_alpha to it.
    d2v.alpha -= 0.0002
    d2v.min_alpha = d2v.alpha

In [None]:
def get_similar_q(qid, top_n=15):
    """Return a TSV line listing the queries most similar to ``qid``.

    Similarity is the cosine between the Doc2Vec vectors (module-level ``d2v``)
    of ``qid`` and of every other id in ``all_queries``.

    Args:
        qid: Query id whose neighbours are wanted.
        top_n: Maximum number of neighbours to return (default 15).

    Returns:
        ``qid`` followed by up to ``top_n`` query ids, most similar first,
        joined by tabs and terminated by a newline.
    """
    # Looked up once; it does not change inside the loop.
    query_vec = d2v.docvecs[qid]
    similarities = {
        other: cos(query_vec, d2v.docvecs[other])
        for other in all_queries
        if other != qid
    }
    ranked = sorted(similarities.items(), key=operator.itemgetter(1), reverse=True)
    # Slicing, unlike indexing positions 0..14, also works when fewer than
    # top_n other queries exist.
    sim_top = [other for other, _ in ranked[:top_n]]
    return '\t'.join([qid] + sim_top) + '\n'
    

In [None]:
# One "qid<TAB>neighbour..." line for each of the first three queries.
lines = [get_similar_q(query_id) for query_id in all_queries[:3]]

In [None]:
# The context manager closes the file even if writing raises.
with open('similar_queries.txt', 'w') as file_sim:
    file_sim.writelines(lines)