In [1]:
!pip install -U sentence-transformers rank_bm25 torch

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.6 MB/s 
[?25hCollecting rank_bm25
  Downloading rank_bm25-0.2.1-py3-none-any.whl (8.5 kB)
Collecting torch
  Downloading torch-1.10.1-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |██████████████████████████████▎ | 834.1 MB 1.5 MB/s eta 0:00:33tcmalloc: large alloc 1147494400 bytes == 0x55ff534ea000 @  0x7f821d0dd615 0x55ff190eb4cc 0x55ff191cb47a 0x55ff190ee2ed 0x55ff191dfe1d 0x55ff19161e99 0x55ff1915c9ee 0x55ff190efbda 0x55ff19161d00 0x55ff1915c9ee 0x55ff190efbda 0x55ff1915e737 0x55ff191e0c66 0x55ff1915ddaf 0x55ff191e0c66 0x55ff1915ddaf 0x55ff191e0c66 0x55ff1915ddaf 0x55ff190f0039 0x55ff19133409 0x55ff190eec52 0x55ff19161c25 0x55ff1915c9ee 0x55ff190efbda 0x55ff1915e737 0x55ff1915c9ee 0x55ff190efbda 0x55ff1915d915 0x55ff190efafa 0x55ff1915dc0d 0x55ff1915c9ee
[K     |████████████████████████████████| 881.9 MB 17 kB/s 
[?25hCollecting trans

In [2]:
import csv
import json
import os
import string

import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from sklearn.feature_extraction import _stop_words
import torch
from tqdm.autonotebook import tqdm

In [3]:
# Report up front whether PyTorch can see a CUDA device.
print('OK' if torch.cuda.is_available() else "Внимание, GPU не обнаружен!")

OK


In [66]:
# Sample queries used to try out and compare the search functions
sample_questions = [
    'How to detect if hard disk is HDD or SSD?',
    'Asus notebook touchpad not working on ubuntu 18.04',
    'how to both show stdout at terminal and send to file',
    'how to increase swap file size?',
    'show temperatures on gnome panel',
    'set high cpu performance mode in ubuntu',
    'how can i install root certificate in Ubuntu?',
    'automatically shutdown ubuntu after timeout',
    'remove stale linux kernels',
    'how to keep swapfile working after reboot',
    'how to completely remove ubuntu package and install again?',
    'how to change root password',
    'Find list of linux users',
    'locate binary file in system directories by file name',
    'how to determine ssh port different from 22?',
    'detect ubuntu version',
    'alt+shift not working on ubuntu 18.04',
    'print screen current active window',
    'how to change audio output device in ubuntu?',
    'install skype for linux'
]

In [5]:
# The dataset was prepared in advance (see the project code) from the XML files of a StackExchange dump.
# Only one StackExchange site, AskUbuntu, was used as the data source rather than the whole archive.
!mkdir -p import_data
!wget https://stackexchange-api.wizzzet.ru/media/passages.csv.gz -P import_data/

--2021-12-17 16:19:47--  https://stackexchange-api.wizzzet.ru/media/passages.csv.gz
Resolving stackexchange-api.wizzzet.ru (stackexchange-api.wizzzet.ru)... 91.239.26.135
Connecting to stackexchange-api.wizzzet.ru (stackexchange-api.wizzzet.ru)|91.239.26.135|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 98514333 (94M) [application/octet-stream]
Saving to: ‘import_data/passages.csv.gz’


2021-12-17 16:19:56 (11.7 MB/s) - ‘import_data/passages.csv.gz’ saved [98514333/98514333]



In [6]:
# Unpack the downloaded archive; -f overwrites an already existing import_data/passages.csv.
!gunzip -f import_data/passages.csv.gz

In [7]:
# Dataset preparation: read every CSV row into memory.
#   passages       - passage texts only; list position is the corpus id used by the search functions
#   passages_cache - the complete rows, kept so the original ids and sources can be restored
passages = []
passages_cache = []

# newline='' is what the csv module requires so that line breaks embedded in quoted
# passages are parsed by the reader itself instead of being translated by the file object.
# The encoding is made explicit so the result does not depend on the locale default
# (assumes the dump is UTF-8 - TODO confirm against the export code).
with open('import_data/passages.csv', 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for i, row in enumerate(reader, start=1):
        # post_id       - question identifier
        # answer_id     - identifier of the answer to the question
        # sentence_type - kind of text: 3 - question title, 2 - question body, 1 - answer body
        # passage       - the text itself
        # Unpacking raises ValueError if a row does not have exactly four columns.
        post_id, answer_id, sentence_type, passage = row

        # Print a progress sample every 100,000 rows.
        if i % 100000 == 0:
            print(
                f'{(str(i) + ")").ljust(12)}'
                f'{str(post_id).ljust(12)}'
                f'{str(answer_id).ljust(12)}'
                f'{sentence_type}: '
                f'{passage[:50]}'
            )

        passages.append(passage)
        # Keep the full row so the original identifiers and sources can be restored
        passages_cache.append(row)

100000)     1445708     1445711     3: should I worry about this "Couldn't find valid fil
200000)     1517936     1517944     2: I installed the System Load Indicator GNOME extens
300000)     1193135     1193136     3: Distribute "home" in 2 hard disks
400000)     1684791     1684794     2: I have phpstorm, datagrip, and pycharm installed o
500000)     1493776     1494716     1: Well, here's my answer. Do you need your sign, LOL
600000)     893164      1023357     1: I found some sites where they do the: sudo adduser
700000)     1055588     1086016     1: This Cisco stuff changes your /etc/resolv.conf. 
I
800000)     1235167     1235191     2: I am following this tutorial for writing pango app


In [8]:
# Text tokenizer for BM25
def bm25_tokenize(text):
    """
    Turn ``text`` into a tuple of BM25 tokens.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every piece. Pieces that end up empty or
    that are English stop words are dropped.
    """
    stop_words = _stop_words.ENGLISH_STOP_WORDS
    words = (word.strip(string.punctuation) for word in text.lower().split())
    return tuple(word for word in words if word and word not in stop_words)

In [9]:
# Build the tokenized corpus for BM25: one token tuple per passage, in passage order
bm25_tokenized_corpus = [bm25_tokenize(text) for text in tqdm(passages)]

# Index the tokenized corpus
bm25 = BM25Okapi(bm25_tokenized_corpus)

  0%|          | 0/881330 [00:00<?, ?it/s]

In [10]:
def search_bm25(query, do_print=False, top_results=10):
    """
    Find similar questions or answers for a query by comparing BM25 scores.

    Args:
        query: query text; it is tokenized with ``bm25_tokenize``.
        do_print: when True, print the score and the passage (newlines
            replaced by spaces) of every returned hit.
        top_results: maximum number of hits to return (default 10). It is
            clamped to the corpus size, so a small corpus no longer makes
            ``np.argpartition`` fail.

    Returns:
        A list of ``{'corpus_id': int, 'score': float}`` dicts sorted by
        descending score. ``corpus_id`` is an index into ``passages``.
        The list is empty when ``top_results`` is not positive or there
        is nothing to score.
    """
    bm25_scores = bm25.get_scores(bm25_tokenize(query))

    k = min(top_results, len(bm25_scores))
    if k <= 0:
        return []

    # argpartition selects the k best scores without sorting the whole array;
    # only those k candidates are sorted below.
    top_idx = np.argpartition(bm25_scores, -k)[-k:]
    # Store plain Python numbers rather than NumPy scalars.
    hits = [{'corpus_id': int(idx), 'score': float(bm25_scores[idx])} for idx in top_idx]
    hits.sort(key=lambda hit: hit['score'], reverse=True)

    if do_print:
        for hit in hits:
            print(str(hit['score']).ljust(16), passages[hit['corpus_id']].replace('\n', ' '))
    return hits

In [30]:
%%time
query = sample_questions[2]
print('Query', query)
# Very slow on a large corpus (about 800k records)
bm25_hits = search_bm25(query, do_print=True)

Query how to both show stdout at terminal and send to file
18.583288263495565 Let's create a test function that sends some output to both stdout and stderr: $ cmd() { echo 1 on stdout; echo 2 on stderr &gt;&amp;2; echo 3 on stdout; echo 4 on stderr &gt;&amp;2; } Running it we see: $ cmd 1 on stdout 2 on stderr 3 on stdout 4 on stderr Now, we want to send the stdout to a file, and the stderr to both the file and the screen. For this we use a simple redirection for stdout, and for stderr we redirect into a process substitution that appends to the file and sends the output back to
18.5705888493834 You have redirected stderr to stdout (the terminal), then you've redirected stdout to a file. In conclusion, you haven't redirected stderr to the file: stderr -&gt; stdout, stderr goes to the terminal. stdout -&gt; $logfile, stdout goes to the $logfile. Try using the following: application &gt;"$logfile" 2&gt;&amp;1 Notice the order matters: stdout -&gt; $logfile, stdout goes to $logfile. stderr

In [12]:
# A faster alternative that also takes the context of the query into account:
ss_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
ss_encoder.max_seq_length = 256  # alternative value that was considered: 512

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.22k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
%%time
# Encode every passage into a single tensor of embeddings.
# This is an expensive step: the recorded run took about 37 minutes of wall time.
corpus_embeddings = ss_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/27542 [00:00<?, ?it/s]

CPU times: user 37min 53s, sys: 22.5 s, total: 38min 16s
Wall time: 37min 34s


In [14]:
# Sanity check of the embedding tensor; the recorded run shows one 384-dimensional row per passage.
corpus_embeddings.shape

torch.Size([881330, 384])

In [15]:
# Number of results; used as the default candidate count (ss_results) of search_sent_cross_transform
TOP_RESULTS = 100

In [25]:
def search_sent_transform(query, top_results=10, do_print=False):
    """
    Find the passages closest to a query using the bi-encoder embeddings.

    Args:
        query: query text, encoded with ``ss_encoder``.
        top_results: number of hits requested from ``util.semantic_search``.
        do_print: when True, print the score and the passage (newlines
            replaced by spaces) of every returned hit.

    Returns:
        The hits for the query: dicts with ``'corpus_id'`` (index into
        ``passages``) and ``'score'``, sorted by descending score.
    """
    # Encode the query with the SentenceTransformer encoder
    query_embedding = ss_encoder.encode(query, convert_to_tensor=True)
    # Put the query on the same device as the corpus embeddings instead of
    # forcing CUDA, so the search also runs on a machine without a GPU.
    query_embedding = query_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_results)
    hits = hits[0]  # Results for the first (and only) query
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    if do_print:
        for hit in hits:
            print(str(hit['score']).ljust(16), passages[hit['corpus_id']].replace('\n', ' '))
    return hits

In [31]:
%%time
# NOTE: `query` is not set in this cell; it is the value assigned in the BM25 timing cell.
print('Query', query)
ss_hits = search_sent_transform(query, top_results=10, do_print=True)
print(' ')

Query how to both show stdout at terminal and send to file
0.7521196007728577 Is it possible to show the output on terminal while writing the output of a command to a file?
0.700324296951294 Redirect terminal output to file
0.6997138857841492 Redirect stdout to file and stderr to file and to screen
0.6970229148864746 Write to file output of two commands running together in one terminal
0.6885201334953308 I guess that you want to write the output to file and stdout too.  echo hello &gt; file.txt  does that partially, but it skips the standard output (prints nothing to the console). If you want that too, use the tee command as below: echo hello | tee file.txt It pipes the output of echo to the input of tee, and then tee writes to the screen and into the file.txt too. If you just want to print to file use cat command as below: echo hello | cat &gt;file.txt Also check this answer for how redirects work:
0.6865543127059937 Redirect stdout to a file and stderr to another file
0.6862585544586

In [59]:
# A cross-encoder is used to re-rank the candidates and refine the similarity scores
cross_encoder = CrossEncoder('cross-encoder/msmarco-MiniLM-L12-en-de-v1', max_length=512)

In [60]:
def search_sent_cross_transform(query, ss_results=TOP_RESULTS, reranked_results=5, do_print=False):
    """
    Retrieve candidates with the bi-encoder and re-rank them with the cross-encoder.

    Args:
        query: query text.
        ss_results: number of candidates requested from
            ``util.semantic_search`` before re-ranking.
        reranked_results: number of hits returned after re-ranking.
        do_print: when True, print the cross-encoder score, the original
            bi-encoder score and the passage of every returned hit.

    Returns:
        The ``reranked_results`` best hits by cross-encoder score. Each hit
        is a dict with ``'corpus_id'`` (index into ``passages``), ``'score'``
        (bi-encoder score) and ``'cross-score'`` (cross-encoder score).
    """
    # Encode the query with the SentenceTransformer encoder
    query_embedding = ss_encoder.encode(query, convert_to_tensor=True)
    # Put the query on the same device as the corpus embeddings instead of
    # forcing CUDA, so the search also runs on a machine without a GPU.
    query_embedding = query_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=ss_results)
    hits = hits[0]  # Results for the first (and only) query
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)

    # Score every (query, candidate passage) pair with the cross-encoder
    cross_pairs = [(query, passages[hit['corpus_id']]) for hit in hits]
    cross_scores = cross_encoder.predict(cross_pairs)

    # Scores come back in the order of the pairs, i.e. the order of `hits`
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    return_hits = hits[:reranked_results]

    if do_print:
        for hit in return_hits:
            print(
                str(hit['cross-score']).ljust(16),
                str(f'(orig. {hit["score"]})').ljust(22),
                passages[hit['corpus_id']].replace('\n', ' ')
            )
    return return_hits

In [64]:
# Try the retrieve-and-re-rank search on one sample query
query = sample_questions[1]
print('Query', query)
cross_hits = search_sent_cross_transform(query, do_print=True)
# Bare last expression: the notebook displays the returned hit dicts
cross_hits

Query Asus notebook touchpad not working on ubuntu 18.04
9.245278         (orig. 0.9685117602348328) Asus touchpad not working ubuntu 18.04
9.16009          (orig. 0.86109459400177) asus tuf fx504 ubuntu 18.04 touchpad not working
8.962846         (orig. 0.8809049129486084) Touchpad no longer working on Ubuntu 18.04 for ASUS zenbook
8.8703785        (orig. 0.7611626386642456) Asus Laptop Touchpad Not Working on Ubuntu 14.04 or Greater Versions (Elantech Touchpad)
8.599068         (orig. 0.7942292094230652) Touchpad features not working (Ubuntu 14.04 - Asus notebook)


[{'corpus_id': 245374, 'cross-score': 9.245278, 'score': 0.9685117602348328},
 {'corpus_id': 240842, 'cross-score': 9.16009, 'score': 0.86109459400177},
 {'corpus_id': 534080, 'cross-score': 8.962846, 'score': 0.8809049129486084},
 {'corpus_id': 479072, 'cross-score': 8.8703785, 'score': 0.7611626386642456},
 {'corpus_id': 860223, 'cross-score': 8.599068, 'score': 0.7942292094230652}]

In [21]:
# Save the bi-encoder model under the path 'ss_encoder'
ss_encoder.save('ss_encoder')

In [22]:
# Save the corpus embeddings.
# NOTE(review): the ss_encoder/ directory must already exist - presumably created by ss_encoder.save('ss_encoder'); confirm.
torch.save(corpus_embeddings, 'ss_encoder/corpus_embeddings.pt')

In [67]:
# Run every sample query through the three searches and print the results:
# the BM25 baseline, the bi-encoder, and the bi-encoder re-ranked by the cross-encoder.
for query in sample_questions:
    print('\nЗапрос:', query)

    # 1) lexical baseline
    print('Baseline BM25:')
    bm25_hits = search_bm25(query, do_print=True)
    print('\n')

    # 2) bi-encoder retrieval
    print('Bi-encoder:')
    ss_hits = search_sent_transform(query, top_results=10, do_print=True)
    print('')

    # 3) bi-encoder candidates re-ranked by the cross-encoder
    print('Bi-encoder с cross-encoder переранжированием:')
    ss_reranked_hits = search_sent_cross_transform(query, do_print=True)
    print('')


Запрос: How to detect if hard disk is HDD or SSD?
Baseline BM25:
24.021630911737557 Problems with the command which makes you know if the hard disk is hdd or ssd
24.021630911737557 Can't see SSD hard disk after install Ubuntu 18.04 on HDD
23.01209692176194 My current setup is an SSD with Windows installed on it and an HDD for data. I bought a second SSD and second HDD. I want to install Ubuntu Server on the second SSD and let it use the second HDD for data. Should I format these hard drives in Windows before I install Ubuntu? Or, should I hook up the hard drives then start the computer up with the Live CD before I let Windows detect the hard drives?
22.24694262697855 How to optimize sata hard disk - HDD (Not A SSD) in ubuntu 12.04 for better performance. Hard Disk ( HDD ): 500 GB - Not a SSD Ram: 4GB 1GB 1 GB Nvidia Graphic Card Intel Core i5 Second Generation Procesor
22.245806778417247 Or, should I hook up the hard drives then start the computer up with  the Live CD before I let Win