In [1]:
!pip install -U sentence-transformers rank_bm25 torch



In [2]:
import csv
import json
import os
import string

import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from sklearn.feature_extraction import _stop_words
import torch
from tqdm.autonotebook import tqdm

In [3]:
if not torch.cuda.is_available():
    print("Внимание, GPU не обнаружен!")
else:
    print('OK')

OK


In [20]:
sample_questions = [
    'How to detect if hard disk is HDD or SSD?',
    'Asus notebook touchpad not working on ubuntu 18.04',
    'how to both show stdout at terminal and send to file',
    'how to increase swap file size?',
    'show temperatures on gnome panel',
    'set high cpu performance mode in ubuntu',
    'how can i install root certificate in Ubuntu?',
    'automatically shutdown ubuntu after timeout',
    'remove stale linux kernels',
    'how to keep swapfile working after reboot',
    'how to completely remove ubuntu package and install again?',
    'how to change root password',
    'Find list of linux users',
    'locate binary file in system directories by file name',
    'how to determine ssh port different from 22?',
    'detect ubuntu version',
    'alt+shift not working',
    'print screen current active window',
    'how to change audio output device in ubuntu?',
    'install skype for linux'
]

In [4]:
!mkdir -p import_data
!wget https://stackexchange-api.wizzzet.ru/media/passages.csv.gz -P import_data/

--2021-12-17 13:49:42--  https://stackexchange-api.wizzzet.ru/media/passages.csv.gz
Resolving stackexchange-api.wizzzet.ru (stackexchange-api.wizzzet.ru)... 91.239.26.135
Connecting to stackexchange-api.wizzzet.ru (stackexchange-api.wizzzet.ru)|91.239.26.135|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 98514333 (94M) [application/octet-stream]
Saving to: ‘import_data/passages.csv.gz’


2021-12-17 13:49:49 (13.6 MB/s) - ‘import_data/passages.csv.gz’ saved [98514333/98514333]



In [6]:
!gunzip -f import_data/passages.csv.gz

In [8]:
passages = []
passages_cache = []

with open('import_data/passages.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    i = 0
    for row in reader:
        post_id, answer_id, sentence_type, passage = row

        i += 1
        if i % 100000 == 0:
            print(
                f'{(str(i) + ")").ljust(12)}'
                f'{str(post_id).ljust(12)}'
                f'{str(answer_id).ljust(12)}'
                f'{sentence_type}: '
                f'{passage[:50]}'
            )

        passages.append(passage)
        # для того, чтобы восстановить исходные идентификаторы и источники
        passages_cache.append(row)

100000)     1445708     1445711     3: should I worry about this "Couldn't find valid fil
200000)     1517936     1517944     2: I installed the System Load Indicator GNOME extens
300000)     1193135     1193136     3: Distribute "home" in 2 hard disks
400000)     1684791     1684794     2: I have phpstorm, datagrip, and pycharm installed o
500000)     1493776     1494716     1: Well, here's my answer. Do you need your sign, LOL
600000)     893164      1023357     1: I found some sites where they do the: sudo adduser
700000)     1055588     1086016     1: This Cisco stuff changes your /etc/resolv.conf. 
I
800000)     1235167     1235191     2: I am following this tutorial for writing pango app


In [11]:
def bm25_tokenize(text):
    tokens = filter(
        lambda x: x and x not in _stop_words.ENGLISH_STOP_WORDS,
        [t.strip(string.punctuation) for t in text.lower().split()]
    )
    return tuple(tokens)

In [12]:
bm25_tokenized_corpus = []
for passage in tqdm(passages):
    bm25_tokenized_corpus.append(bm25_tokenize(passage))

bm25 = BM25Okapi(bm25_tokenized_corpus)

  0%|          | 0/881330 [00:00<?, ?it/s]

In [14]:
def search_bm25(query, do_print=False):
    bm25_scores = bm25.get_scores(bm25_tokenize(query))
    top_idx = np.argpartition(bm25_scores, -10)[-10:]
    hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_idx]
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    if do_print:
        for hit in hits:
            print(str(hit['score']).ljust(16), passages[hit['corpus_id']].replace('\n', ' '))
    return hits

In [15]:
%%time
bm25_hits = search_bm25(sample_questions[0], do_print=True)

24.021630911737557 Problems with the command which makes you know if the hard disk is hdd or ssd
24.021630911737557 Can't see SSD hard disk after install Ubuntu 18.04 on HDD
23.01209692176194 My current setup is an SSD with Windows installed on it and an HDD for data. I bought a second SSD and second HDD. I want to install Ubuntu Server on the second SSD and let it use the second HDD for data. Should I format these hard drives in Windows before I install Ubuntu? Or, should I hook up the hard drives then start the computer up with the Live CD before I let Windows detect the hard drives?
22.24694262697855 How to optimize sata hard disk - HDD (Not A SSD) in ubuntu 12.04 for better performance. Hard Disk ( HDD ): 500 GB - Not a SSD Ram: 4GB 1GB 1 GB Nvidia Graphic Card Intel Core i5 Second Generation Procesor
22.245806778417247 Or, should I hook up the hard drives then start the computer up with  the Live CD before I let Windows detect the hard drives? Yes. And you should choose manual par

In [17]:
ss_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
ss_encoder.max_seq_length = 256  # 512

In [18]:
%%time
corpus_embeddings = ss_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/27542 [00:00<?, ?it/s]

CPU times: user 16min 42s, sys: 4min 58s, total: 21min 40s
Wall time: 18min 33s


In [19]:
corpus_embeddings.shape

torch.Size([881330, 384])

In [22]:
TOP_RESULTS = 100

In [23]:
def search_sent_transform(query, top_results=10, do_print=False):
    # Кодируем запрос использую кодировщик SentenceTransformer
    query_embedding = ss_encoder.encode(query, convert_to_tensor=True)
    query_embedding = query_embedding.cuda()
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_results)
    hits = hits[0]  # Получаем результаты по первому запросу
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    if do_print:
        for hit in hits:
            print(str(hit['score']).ljust(16), passages[hit['corpus_id']].replace('\n', ' '))
    return hits

In [27]:
%%time
ss_hits = search_sent_transform(sample_questions[0], top_results=10, do_print=True)
print(' ')

0.8010199069976807 Problems with the command which makes you know if the hard disk is hdd or ssd
0.7489768266677856 How to check the disk activity of my hard-drive?
0.7453947067260742 How to tell whether my HDD is IDE or SATA
0.7265328168869019 Check if my ssd is SATA or M2
0.717353343963623 How to check HDD info technical?
0.6988400816917419 I want to have a script which tells me what sort of disk I have on my laptop. I have found a command used how to know if my disk is a ssd or hdd i.e. : cat /sys/block/sda/queue/rotational unfortunately I get: cat: /sys/block/sda/queue/rotational: No such file or directory my ubuntu version is 20.10. EDIT: the command: lsblk | grep disk gives nvme0n1 259:0 0 476,9G 0 disk  Is there another command related to my issue?
0.6916237473487854 How can I check the SMART status of a SSD or HDD on current versions of Ubuntu 14.04 through 20.04?
0.6797694563865662 How to check my HDD's for defects
0.6786268353462219 How to check if my Ubuntu is placed on SSD?

In [28]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

Downloading:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [29]:
def search_sent_transform(query, ss_results=TOP_RESULTS, reranked_results=5, do_print=False):
    # Кодируем запрос использую кодировщик SentenceTransformer
    query_embedding = ss_encoder.encode(query, convert_to_tensor=True)
    query_embedding = query_embedding.cuda()
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=ss_results)
    hits = hits[0]  # Получаем результаты по первому запросу
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    
    cross_pairs = [(query, passages[hit['corpus_id']]) for hit in hits]
    cross_scores = cross_encoder.predict(cross_pairs)

    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    return_hits = hits[0:reranked_results]

    if do_print:
        for hit in return_hits:
            print(
                str(hit['cross-score']).ljust(16),
                str(f'(orig. {hit["score"]})').ljust(22),
                passages[hit['corpus_id']].replace('\n', ' ')
            )
    return return_hits

In [30]:
ss_reranked_hits = search_sent_transform(sample_questions[0], do_print=True)

5.2528973        (orig. 0.8010199069976807) Problems with the command which makes you know if the hard disk is hdd or ssd
-2.3995855       (orig. 0.7489768266677856) How to check the disk activity of my hard-drive?
-2.4500012       (orig. 0.7453947067260742) How to tell whether my HDD is IDE or SATA
-1.3704537       (orig. 0.7265328168869019) Check if my ssd is SATA or M2
-2.3349571       (orig. 0.717353343963623) How to check HDD info technical?


In [35]:
ss_encoder.save('ss_encoder')

In [40]:
torch.save(corpus_embeddings, 'ss_encoder/corpus_embeddings.pt')