In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import os
from nltk.tokenize import word_tokenize
from tqdm import tqdm

## Подготовка данных

Загружаем запросы, заголовки

In [2]:
# Build the mapping query id -> normalized query text from a tab-separated file.
queries = {}
with open('norm_queries.tsv', encoding='utf-8') as f:
    lines = f.readlines()
for line in lines:
    line = line.strip('\n').split('\t')
    # Rows that begin with a tab yield an empty first column; discard it.
    if line[0] == '':
        line = line[1:]
    queries[line[0]] = line[1]

# Build the mapping document id -> normalized title from a tab-separated file.
titles = {}
with open("norm_titles.tsv", 'r', encoding='utf-8') as f:
    for line in f:
        # Strip only the line terminator before splitting. Blindly dropping the
        # last character of the title column would truncate the title when the
        # final line has no trailing newline or when the row has extra columns.
        line = line.rstrip('\n').split('\t')
        titles[line[0]] = line[1]

В словарях сначала идут все query_id из трейна, затем из теста — в том же порядке, в каком они встречаются в train.marks.tsv и sample.csv.

In [3]:
# Shared lookup tables filled by the following cells:
#   qid2docid   - query id -> list of document ids
#   qid2query   - query id -> query text
#   docid2title - document id -> title
qid2docid, qid2query, docid2title = {}, {}, {}

In [4]:
# Read (query id, document id) pairs from the training marks file: column 0 is
# used as the query id and column 1 as the document id. Pairs whose document
# has no known title are ignored.
train_queries_unq = []          # query ids in order of first appearance
train_queries_seen = set()      # same ids, kept as a set for O(1) membership tests
with open("train.marks.tsv", 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip('\n').split('\t')
        query_id, doc_id = line[0], line[1]
        if doc_id not in titles:
            continue
        qid2docid.setdefault(query_id, []).append(doc_id)
        docid2title[doc_id] = titles[doc_id]
        if query_id not in train_queries_seen:
            qid2query[query_id] = queries[query_id]
            train_queries_unq.append(query_id)
            train_queries_seen.add(query_id)

In [5]:
# Read (query id, document id) pairs from the comma-separated sample file:
# column 0 is used as the query id and column 1 as the document id. Pairs whose
# document has no known title are ignored.
test_queries_unq = []           # query ids in order of first appearance
test_queries_seen = set()       # same ids, kept as a set for O(1) membership tests
with open("sample.csv", 'r', encoding='utf-8') as f:
    f.readline()  # skip the first line of the file
    for line in f:
        line = line.strip('\n').split(',')
        query_id, doc_id = line[0], line[1]
        if doc_id not in titles:
            continue
        qid2docid.setdefault(query_id, []).append(doc_id)
        docid2title[doc_id] = titles[doc_id]
        if query_id not in test_queries_seen:
            qid2query[query_id] = queries[query_id]
            test_queries_unq.append(query_id)
            test_queries_seen.add(query_id)

In [6]:
# Every query id that has at least one document, in dict insertion order.
all_queries = [query_id for query_id in qid2docid]

## Вычисление признаков

Косинусная близость (cosine similarity)

In [7]:
def cos(sp1,sp2):
    """Return the cosine similarity of two sparse row vectors as a scalar.

    Args:
        sp1: Sparse vector exposing ``todense()`` (e.g. one row of a TF-IDF matrix).
        sp2: Sparse vector of the same length as ``sp1``.

    Returns:
        ``np.float64`` scalar ``dot(sp1, sp2) / (|sp1| * |sp2|)``; ``0.0`` when
        either vector has zero norm (instead of dividing by zero).
    """
    # Flatten to 1-D arrays so the dot product is a true scalar rather than a
    # 1x1 matrix; callers convert the result with str().
    v1 = np.asarray(sp1.todense()).ravel()
    v2 = np.asarray(sp2.todense()).ravel()
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    if norm1 == 0 or norm2 == 0:
        # An all-zero vector shares nothing with any other vector.
        return np.float64(0.0)
    return np.float64(np.dot(v1, v2) / norm1 / norm2)

#### tfidf для текстов документов и запросов

In [46]:
def tfidf_score(q, ngr_range, outdir):
    """Write TF-IDF cosine scores between a query and its document texts.

    Fits a character n-gram ``TfidfVectorizer`` on the query text plus the
    first line of every readable ``data/<doc_id>.txt`` file listed for the
    query, then writes one ``q<TAB>doc_id<TAB>score`` line per readable
    document to ``<outdir>/<q>.txt``.

    Args:
        q: Query id; key into the module-level ``qid2query`` and ``qid2docid``.
        ngr_range: ``(min_n, max_n)`` tuple passed as ``ngram_range``.
        outdir: Existing directory that receives the result file.
    """
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=ngr_range)
    corpus = [qid2query[q]]   # index 0 is always the query
    doc_ids = []              # doc_ids[i] belongs to corpus[i + 1]
    for doc in qid2docid[q]:
        try:
            with open('data/{}.txt'.format(doc), encoding='utf-8') as f:
                text = f.readline()
        except (OSError, UnicodeDecodeError):
            # Missing or unreadable document: leave it out of the result.
            continue
        corpus.append(text)
        doc_ids.append(doc)

    tfidf_vectors = vectorizer.fit_transform(corpus)
    query_vector = tfidf_vectors[0]
    # Pair each score with the id of the document that was actually read, so a
    # skipped file cannot shift ids onto the wrong rows.
    result_lines = [
        '\t'.join([q, doc_id, str(cos(tfidf_vectors[i], query_vector))]) + '\n'
        for i, doc_id in enumerate(doc_ids, start=1)
    ]
    with open(outdir+'/{}.txt'.format(q),'w') as fout:
        fout.writelines(result_lines)

In [None]:
# Text-vs-query TF-IDF scores on character 1- to 3-grams.
out_dir = 'text_tfids_13'
# exist_ok makes re-runs safe while still surfacing real failures (e.g. permissions).
os.makedirs(out_dir, exist_ok=True)

# List the directory once; queries that already have a result file are skipped.
done_files = set(os.listdir(out_dir))
for q in all_queries:
    if q + '.txt' not in done_files:
        tfidf_score(q, (1,3), out_dir)

In [None]:
# Text-vs-query TF-IDF scores on character 3- to 7-grams.
out_dir = 'text_tfids_37'
# exist_ok makes re-runs safe while still surfacing real failures (e.g. permissions).
os.makedirs(out_dir, exist_ok=True)

# List the directory once; queries that already have a result file are skipped.
done_files = set(os.listdir(out_dir))
for q in all_queries:
    if q + '.txt' not in done_files:
        tfidf_score(q, (3,7), out_dir)

#### tfidf для заголовков и запросов

In [10]:
def tfidf_title_score(q, ngr_range, out_dir):
    """Write TF-IDF cosine scores between a query and its document titles.

    Fits a character n-gram ``TfidfVectorizer`` on the titles of the query's
    documents plus the query text, then writes one ``doc_id<TAB>score`` line
    per titled document to ``<out_dir>/<q>.txt``.

    Args:
        q: Query id; key into the module-level ``qid2query`` and ``qid2docid``.
        ngr_range: ``(min_n, max_n)`` tuple passed as ``ngram_range``.
        out_dir: Existing directory that receives the result file.
    """
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=ngr_range)
    corpus = []
    docs = []   # docs[i] is the id of the document whose title is corpus[i]
    for doc in qid2docid[q]:
        try:
            title = docid2title[doc]
        except KeyError:
            # Document without a known title: leave it out of the result.
            continue
        corpus.append(title)
        docs.append(doc)
    # The query goes last, so it is always addressable as index -1.
    corpus.append(qid2query[q])
    tfidf_vectors = vectorizer.fit_transform(corpus)
    query_vector = tfidf_vectors[-1]

    with open(out_dir+'/{}.txt'.format(q),'w') as fout:
        for i, doc_id in enumerate(docs):
            fout.write(str(doc_id)+ '\t'+str(cos(tfidf_vectors[i],query_vector)) + '\n')

In [None]:
# Title-vs-query TF-IDF scores on character 1- to 3-grams.
out_dir = 'title_tfids_13'
# exist_ok makes re-runs safe while still surfacing real failures (e.g. permissions).
os.makedirs(out_dir, exist_ok=True)

# List the directory once; queries that already have a result file are skipped.
done_files = set(os.listdir(out_dir))
for q in tqdm(all_queries):
    if q + '.txt' not in done_files:
        tfidf_title_score(q, (1,3), out_dir)

In [None]:
# Title-vs-query TF-IDF scores on character 3- to 8-grams.
out_dir = 'title_tfids_38'
# exist_ok makes re-runs safe while still surfacing real failures (e.g. permissions).
os.makedirs(out_dir, exist_ok=True)

# List the directory once; queries that already have a result file are skipped.
done_files = set(os.listdir(out_dir))
for q in tqdm(all_queries):
    if q + '.txt' not in done_files:
        tfidf_title_score(q, (3,8), out_dir)

#### BM25 для текстов и запросов

In [7]:
def BM25_text_score(q, out_dir):
    """Write BM25 scores between a query and its document texts.

    Each readable ``data/<doc_id>.txt`` file listed for the query contributes
    the first 1000 word tokens of its first line. One ``doc_id<TAB>score``
    line per readable document is written to ``<out_dir>/<q>.txt``.

    Args:
        q: Query id; key into the module-level ``qid2query`` and ``qid2docid``.
        out_dir: Existing directory that receives the result file.
    """
    corpus = []   # [doc_id, tokens] pairs, in the order documents were read
    for doc in qid2docid[q]:
        try:
            with open('data/{}.txt'.format(doc), encoding='utf-8') as f:
                first_line = f.readline()
        except (OSError, UnicodeDecodeError):
            # Missing or unreadable document: leave it out of the result.
            continue
        corpus.append([doc, word_tokenize(first_line)[:1000]])

    out_path = out_dir+'/'+str(q)+'.txt'
    if not corpus:
        # No document could be read, so there is nothing to index or score;
        # write an empty result file, as the other scorers do in this case.
        with open(out_path,'w'):
            pass
        return

    bm = BM25Okapi([tokens for _, tokens in corpus])
    # The query must be tokenized the same way as the documents; passing the
    # raw string would make BM25 iterate over its individual characters.
    scores = bm.get_scores(word_tokenize(qid2query[q]))
    with open(out_path,'w') as fout:
        for (doc_id, _), score in zip(corpus, scores):
            fout.write(str(doc_id) + '\t' + str(score) + '\n')

In [9]:
# BM25 scores of document texts against queries.
out_dir = 'bm_texts'
# exist_ok makes re-runs safe while still surfacing real failures (e.g. permissions).
os.makedirs(out_dir, exist_ok=True)

# List the directory once; queries that already have a result file are skipped.
done_files = set(os.listdir(out_dir))
for q in tqdm(all_queries):
    if q + '.txt' not in done_files:
        BM25_text_score(q, out_dir)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:44<00:00,  4.48s/it]


#### Пассажи

для текстов

In [18]:

def passage_score(q, out_dir, window, stride):
    """Write sliding-window passage scores between a query and document texts.

    For every readable ``data/<doc_id>.txt`` file listed for the query, the
    first line is word-tokenized and scanned with windows of ``window`` tokens
    taken every ``stride`` tokens. A window that shares tokens with the query
    contributes the summed weights of those tokens, boosted by word-order and
    proximity bonuses, and multiplied by ``len(text) - window - i`` so earlier
    windows weigh more. One ``doc_id<TAB>score`` line per readable document is
    written to ``<out_dir>/<q>.txt``.

    Args:
        q: Query id; key into the module-level ``qid2query`` and ``qid2docid``.
        out_dir: Existing directory that receives the result file.
        window: Window length in tokens.
        stride: Step between consecutive window starts.
    """
    texts_dict = {}
    idf_dict = {}

    for doc in qid2docid[q]:
        try:
            with open('data/{}.txt'.format(doc), encoding='utf-8') as f:
                first_line = f.readline()
        except (OSError, UnicodeDecodeError):
            # Missing or unreadable document: leave it out of the result.
            continue
        tokens = word_tokenize(first_line)
        texts_dict[doc] = tokens
        # NOTE(review): this counts every occurrence of a token, not the number
        # of documents containing it - confirm that is the intended weighting.
        for word in tokens:
            idf_dict[word] = idf_dict.get(word, 0) + 1

    n_docs = len(texts_dict)
    for key in idf_dict:
        idf_dict[key] = n_docs / idf_dict[key]

    query_words = word_tokenize(qid2query[q])
    query_set = set(query_words)
    # First position of every query token, computed once instead of calling
    # list.index() inside the innermost loop.
    query_pos = {}
    for pos, word in enumerate(query_words):
        query_pos.setdefault(word, pos)

    with open(out_dir+'/'+q+'.txt','w') as fout:
        for key, cur_text in texts_dict.items():
            # First position of each query token in the whole text.
            # NOTE(review): positions are relative to the full text, not to the
            # current window - confirm this is intended.
            text_pos = {}
            for pos, word in enumerate(cur_text):
                if word in query_set and word not in text_pos:
                    text_pos[word] = pos

            pas_score = 0
            for i in range(0, len(cur_text) - window, stride):
                inters = set.intersection(set(cur_text[i:i+window]), query_set)
                if len(inters) > 0:
                    cidf = 0
                    for w in inters:
                        cidf += idf_dict[w]
                    for w in inters:
                        for k in inters:
                            if w != k:
                                text_gap = text_pos[w] - text_pos[k]
                                query_gap = query_pos[w] - query_pos[k]
                                # Same relative order in text and query.
                                if text_gap * query_gap > 0:
                                    cidf *= 1.05
                                # At least as close in the text as in the query.
                                # NOTE(review): passage_score_title uses 1.01 here;
                                # confirm the difference is intentional.
                                if abs(text_gap) <= abs(query_gap):
                                    cidf *= 1.001
                    pas_score += cidf*(len(cur_text) - window - i)
            fout.write(str(key) + '\t' + str(pas_score) + '\n')

In [None]:
# Passage scores over document texts: window 3, stride 1.
out_dir = 'pass_text_31'
# exist_ok makes re-runs safe while still surfacing real failures (e.g. permissions).
os.makedirs(out_dir, exist_ok=True)

# List the directory once; queries that already have a result file are skipped.
done_files = set(os.listdir(out_dir))
for q in tqdm(all_queries):
    if q + '.txt' not in done_files:
        passage_score(q, out_dir, 3, 1)

In [34]:
# Passage scores over document texts: window 3, stride 2.
out_dir = 'pass_text_32'
# exist_ok makes re-runs safe while still surfacing real failures (e.g. permissions).
os.makedirs(out_dir, exist_ok=True)

# List the directory once; queries that already have a result file are skipped.
done_files = set(os.listdir(out_dir))
for q in tqdm(all_queries):
    if q + '.txt' not in done_files:
        passage_score(q, out_dir, 3, 2)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:20<00:00,  6.88s/it]


In [22]:
# Passage scores over document texts: window 7, stride 3.
out_dir = 'pass_text_73'
# exist_ok makes re-runs safe while still surfacing real failures (e.g. permissions).
os.makedirs(out_dir, exist_ok=True)

# List the directory once; queries that already have a result file are skipped.
done_files = set(os.listdir(out_dir))
for q in tqdm(all_queries):
    if q + '.txt' not in done_files:
        passage_score(q, out_dir, 7, 3)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:20<00:00,  6.79s/it]


Для заголовков

In [23]:
def passage_score_title(q, out_dir, window, stride):
    """Write sliding-window passage scores between a query and document titles.

    The title of every document listed for the query is word-tokenized and
    scanned with windows of ``window`` tokens taken every ``stride`` tokens. A
    window that shares tokens with the query contributes the summed weights of
    those tokens, boosted by word-order and proximity bonuses, and multiplied
    by ``len(title) - window - i`` so earlier windows weigh more. One
    ``doc_id<TAB>score`` line per titled document is written to
    ``<out_dir>/<q>.txt``.

    Args:
        q: Query id; key into the module-level ``qid2query`` and ``qid2docid``.
        out_dir: Existing directory that receives the result file.
        window: Window length in tokens.
        stride: Step between consecutive window starts.
    """
    texts_dict = {}
    idf_dict = {}
    for doc in qid2docid[q]:
        try:
            title = docid2title[doc]
        except KeyError:
            # Document without a known title: leave it out of the result.
            continue
        tokens = word_tokenize(title)
        texts_dict[doc] = tokens
        # NOTE(review): this counts every occurrence of a token, not the number
        # of titles containing it - confirm that is the intended weighting.
        for w in tokens:
            idf_dict[w] = idf_dict.get(w, 0) + 1

    n_docs = len(texts_dict)
    for key in idf_dict:
        idf_dict[key] = n_docs / idf_dict[key]

    query_words = word_tokenize(qid2query[q])
    query_set = set(query_words)
    # First position of every query token, computed once instead of calling
    # list.index() inside the innermost loop.
    query_pos = {}
    for pos, word in enumerate(query_words):
        query_pos.setdefault(word, pos)

    with open(out_dir+'/'+q+'.txt','w') as fout:
        for key, cur_text in texts_dict.items():
            # First position of each query token in the whole title.
            # NOTE(review): positions are relative to the full title, not to
            # the current window - confirm this is intended.
            text_pos = {}
            for pos, word in enumerate(cur_text):
                if word in query_set and word not in text_pos:
                    text_pos[word] = pos

            pas_score = 0
            # NOTE(review): a title with at most `window` tokens yields no
            # windows and therefore always scores 0 - confirm this is intended.
            for i in range(0, len(cur_text) - window, stride):
                inters = set.intersection(set(cur_text[i:i+window]), query_set)
                if len(inters) > 0:
                    cidf = 0
                    for w in inters:
                        cidf += idf_dict[w]
                    for w in inters:
                        for k in inters:
                            if w != k:
                                text_gap = text_pos[w] - text_pos[k]
                                query_gap = query_pos[w] - query_pos[k]
                                # Same relative order in title and query.
                                if text_gap * query_gap > 0:
                                    cidf *= 1.05
                                # At least as close in the title as in the query.
                                if abs(text_gap) <= abs(query_gap):
                                    cidf *= 1.01
                    pas_score += cidf*(len(cur_text) - window - i)
            fout.write(str(key) + '\t' + str(pas_score) + '\n')

In [40]:
# Passage scores over document titles: window 3, stride 1.
out_dir = 'pass_title_31'
# exist_ok makes re-runs safe while still surfacing real failures (e.g. permissions).
os.makedirs(out_dir, exist_ok=True)

# List the directory once; queries that already have a result file are skipped.
done_files = set(os.listdir(out_dir))
for q in tqdm(all_queries):
    if q + '.txt' not in done_files:
        passage_score_title(q, out_dir, 3, 1)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 97.57it/s]
