In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Загрузка данных.

In [4]:
# Cleaned questions dataset
# NOTE(review): latin1 decodes any byte sequence without raising - confirm it matches the encoding the processed CSV was written with
questions_clear = pd.read_csv('../Data/Processed/Questions.csv', encoding='latin1')
# Show the first 5 rows
questions_clear.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,write database generation script sql want exec...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,really good tutorial explain branching merge a...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,anyone get experience create sqlbased aspnet s...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,something pseudosolved many time never quite f...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,little game write c us database backend tradin...


In [5]:
# Cleaned answers dataset
# NOTE(review): latin1 decodes any byte sequence without raising - confirm it matches the encoding the processed CSV was written with
answers_clear = pd.read_csv('../Data/Processed/Answers.csv', encoding='latin1')
# Show the first 5 rows
answers_clear.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,version control subversion good resource sourc...
1,124,26.0,2008-08-01T16:09:47Z,80,12,wound use kind hack actually work pretty well ...
2,199,50.0,2008-08-01T19:36:46Z,180,1,read somewhere human eye distinguish less 4 va...
3,269,91.0,2008-08-01T23:49:57Z,260,4,yes thought soon figure another domainspecific...
4,307,49.0,2008-08-02T01:49:46Z,260,28,oleg shilos c script solution code project rea...


## Генерация эмбеддингов

In [6]:
# For every question (ParentId) keep the answer row that holds the maximum Score
best_answers_clear = answers_clear.loc[
    answers_clear.groupby('ParentId')['Score'].idxmax()
]

# Inner-join the questions with their best answer; the overlapping columns
# (Body, Score, CreationDate) receive the _question / _answer suffixes.
# Missing bodies become empty strings before the combined text is built.
merged = (
    questions_clear[['Id', 'Title', 'Body', 'Score', 'CreationDate']]
    .merge(
        best_answers_clear[['ParentId', 'Body', 'Score', 'CreationDate']],
        left_on='Id',
        right_on='ParentId',
        suffixes=('_question', '_answer'),
    )
    .assign(
        Body_answer=lambda frame: frame['Body_answer'].fillna(''),
        Body_question=lambda frame: frame['Body_question'].fillna(''),
        # Single text used for the embedding: question body + answer body
        text=lambda frame: frame['Body_question'] + ' ' + frame['Body_answer'],
    )
)

merged['text']

0          write database generation script sql want exec...
1          really good tutorial explain branching merge a...
2          anyone get experience create sqlbased aspnet s...
3          something pseudosolved many time never quite f...
4          little game write c us database backend tradin...
                                 ...                        
1102563    around 200000 data excel separate per 15min da...
1102564    able connect linkedin api two month everything...
1102565    building multitenant app rail 4 apartment devi...
1102566    learn c try draw iscosceles triangle use aster...
1102567    need extend shell script bash much familiar py...
Name: text, Length: 1102568, dtype: object

In [5]:
import torch
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# Report the GPU when one is present; querying device 0 without CUDA would raise
if torch.cuda.is_available():
    gpu_info = torch.cuda.get_device_properties(0)
    print(f"GPU: {gpu_info.name}, Memory: {gpu_info.total_memory // (1024 ** 2)} MB")
else:
    print("GPU: CUDA is not available, computations will fall back to CPU")

GPU: NVIDIA GeForce RTX 4060, Memory: 8187 MB


In [11]:
# Model selection
model_name = 'all-MiniLM-L6-v2'
# Half precision: per the author's note about twice as fast, at the cost of
# precision down to roughly 4 digits
model = SentenceTransformer(model_name).half()

texts = merged['text'].tolist()
batch_size = 4096  # number of texts handed to each encode() call

# Encode the corpus slice by slice; every call returns one numpy block
with torch.no_grad():
    embeddings = [
        model.encode(
            texts[start:start + batch_size],
            device='cuda',
            show_progress_bar=False,
            convert_to_numpy=True,  # author's note: faster and lighter on memory
            num_workers=12,  # NOTE(review): confirm the installed sentence-transformers version honours this argument
        )
        for start in tqdm(range(0, len(texts), batch_size), desc="Calculating embeddings")
    ]

# Stack the per-slice blocks into a single matrix
emb_matrix = np.vstack(embeddings)

print(f"Эмбеддинги рассчитаны: {emb_matrix.shape[0]} объектов, размерность {emb_matrix.shape[1]}")

Calculating embeddings:   0%|          | 0/270 [00:00<?, ?it/s]

Эмбеддинги рассчитаны: 1102568 объектов, размерность 384


In [33]:
# Save the embeddings to a file so they can be reloaded without re-encoding
os.makedirs('../Data/Embeddings', exist_ok=True)  # np.save does not create a missing target directory
np.save('../Data/Embeddings/embeddings.npy', emb_matrix)

In [7]:
# Load the embeddings back from the file and report the matrix dimensions
emb_matrix = np.load('../Data/Embeddings/embeddings.npy')
print(f"Эмбеддинги загружены: {emb_matrix.shape[0]} объектов, размерность {emb_matrix.shape[1]}")

Эмбеддинги загружены: 1102568 объектов, размерность 384


## Развертывание Chroma DB

In [8]:
# Discard the two separate body columns from merged
merged = merged.drop(['Body_question', 'Body_answer'], axis='columns')
merged

Unnamed: 0,Id,Title,Score_question,CreationDate_question,ParentId,Score_answer,CreationDate_answer,text
0,80,SQLStatement.execute() - multiple queries in o...,26,2008-08-01T13:57:07Z,80,12,2008-08-01T16:09:47Z,write database generation script sql want exec...
1,90,Good branching and merging tutorials for Torto...,144,2008-08-01T14:41:24Z,90,19,2009-09-23T15:40:46Z,really good tutorial explain branching merge a...
2,120,ASP.NET Site Maps,21,2008-08-01T15:50:08Z,120,9,2008-09-23T22:41:11Z,anyone get experience create sqlbased aspnet s...
3,180,Function for creating color wheels,53,2008-08-01T18:42:19Z,180,21,2008-08-02T19:03:52Z,something pseudosolved many time never quite f...
4,260,Adding scripting functionality to .NET applica...,49,2008-08-01T23:22:08Z,260,28,2008-08-02T01:49:46Z,little game write c us database backend tradin...
...,...,...,...,...,...,...,...,...
1102563,40142860,Adding large data in Excel,0,2016-10-19T23:01:07Z,40142860,0,2016-10-19T23:14:50Z,around 200000 data excel separate per 15min da...
1102564,40142900,LinkedIN Encounter error: Your application has...,0,2016-10-19T23:05:07Z,40142900,0,2016-10-19T23:16:10Z,able connect linkedin api two month everything...
1102565,40142910,Validation for must_be_below_user_limit allowi...,0,2016-10-19T23:05:49Z,40142910,0,2016-10-19T23:58:58Z,building multitenant app rail 4 apartment devi...
1102566,40142940,Drawing an iscosceles triangle of asteriks on C++,-1,2016-10-19T23:08:42Z,40142940,1,2016-10-19T23:30:12Z,learn c try draw iscosceles triangle use aster...


In [9]:
# Attach the embeddings to merged: list(emb_matrix) yields one row vector per DataFrame row
# NOTE(review): assumes the matrix rows are in the same order as the rows of merged - confirm when the matrix is reloaded from disk
merged['embeddings'] = list(emb_matrix)
merged

Unnamed: 0,Id,Title,Score_question,CreationDate_question,ParentId,Score_answer,CreationDate_answer,text,embeddings
0,80,SQLStatement.execute() - multiple queries in o...,26,2008-08-01T13:57:07Z,80,12,2008-08-01T16:09:47Z,write database generation script sql want exec...,"[0.01357938, -0.024715697, -0.056818295, 0.089..."
1,90,Good branching and merging tutorials for Torto...,144,2008-08-01T14:41:24Z,90,19,2009-09-23T15:40:46Z,really good tutorial explain branching merge a...,"[0.022999706, -0.054095984, -0.04250651, -0.07..."
2,120,ASP.NET Site Maps,21,2008-08-01T15:50:08Z,120,9,2008-09-23T22:41:11Z,anyone get experience create sqlbased aspnet s...,"[0.002508051, -0.064242, -0.015580753, 0.00925..."
3,180,Function for creating color wheels,53,2008-08-01T18:42:19Z,180,21,2008-08-02T19:03:52Z,something pseudosolved many time never quite f...,"[0.009283214, 0.027419629, -0.014058785, -0.06..."
4,260,Adding scripting functionality to .NET applica...,49,2008-08-01T23:22:08Z,260,28,2008-08-02T01:49:46Z,little game write c us database backend tradin...,"[-0.015197881, -0.029962653, -0.12446398, 0.04..."
...,...,...,...,...,...,...,...,...,...
1102563,40142860,Adding large data in Excel,0,2016-10-19T23:01:07Z,40142860,0,2016-10-19T23:14:50Z,around 200000 data excel separate per 15min da...,"[-0.041066702, 0.094635084, 0.016694594, 0.017..."
1102564,40142900,LinkedIN Encounter error: Your application has...,0,2016-10-19T23:05:07Z,40142900,0,2016-10-19T23:16:10Z,able connect linkedin api two month everything...,"[-0.07177593, -0.01753621, 0.07370899, -0.0023..."
1102565,40142910,Validation for must_be_below_user_limit allowi...,0,2016-10-19T23:05:49Z,40142910,0,2016-10-19T23:58:58Z,building multitenant app rail 4 apartment devi...,"[0.0020394926, -0.033933215, -0.04788427, -0.0..."
1102566,40142940,Drawing an iscosceles triangle of asteriks on C++,-1,2016-10-19T23:08:42Z,40142940,1,2016-10-19T23:30:12Z,learn c try draw iscosceles triangle use aster...,"[0.0074667125, 0.11854785, -0.06675289, -0.068..."


In [10]:
# Rename the combined question + answer text column.
# Only this mapping has an effect: the remaining columns already carry their
# final names after the merge suffixes were applied.
merged = merged.rename(columns={'text': 'Clean_text'})
merged

Unnamed: 0,Id,Title,Score_question,CreationDate_question,ParentId,Score_answer,CreationDate_answer,Clean_text,embeddings
0,80,SQLStatement.execute() - multiple queries in o...,26,2008-08-01T13:57:07Z,80,12,2008-08-01T16:09:47Z,write database generation script sql want exec...,"[0.01357938, -0.024715697, -0.056818295, 0.089..."
1,90,Good branching and merging tutorials for Torto...,144,2008-08-01T14:41:24Z,90,19,2009-09-23T15:40:46Z,really good tutorial explain branching merge a...,"[0.022999706, -0.054095984, -0.04250651, -0.07..."
2,120,ASP.NET Site Maps,21,2008-08-01T15:50:08Z,120,9,2008-09-23T22:41:11Z,anyone get experience create sqlbased aspnet s...,"[0.002508051, -0.064242, -0.015580753, 0.00925..."
3,180,Function for creating color wheels,53,2008-08-01T18:42:19Z,180,21,2008-08-02T19:03:52Z,something pseudosolved many time never quite f...,"[0.009283214, 0.027419629, -0.014058785, -0.06..."
4,260,Adding scripting functionality to .NET applica...,49,2008-08-01T23:22:08Z,260,28,2008-08-02T01:49:46Z,little game write c us database backend tradin...,"[-0.015197881, -0.029962653, -0.12446398, 0.04..."
...,...,...,...,...,...,...,...,...,...
1102563,40142860,Adding large data in Excel,0,2016-10-19T23:01:07Z,40142860,0,2016-10-19T23:14:50Z,around 200000 data excel separate per 15min da...,"[-0.041066702, 0.094635084, 0.016694594, 0.017..."
1102564,40142900,LinkedIN Encounter error: Your application has...,0,2016-10-19T23:05:07Z,40142900,0,2016-10-19T23:16:10Z,able connect linkedin api two month everything...,"[-0.07177593, -0.01753621, 0.07370899, -0.0023..."
1102565,40142910,Validation for must_be_below_user_limit allowi...,0,2016-10-19T23:05:49Z,40142910,0,2016-10-19T23:58:58Z,building multitenant app rail 4 apartment devi...,"[0.0020394926, -0.033933215, -0.04788427, -0.0..."
1102566,40142940,Drawing an iscosceles triangle of asteriks on C++,-1,2016-10-19T23:08:42Z,40142940,1,2016-10-19T23:30:12Z,learn c try draw iscosceles triangle use aster...,"[0.0074667125, 0.11854785, -0.06675289, -0.068..."


In [21]:
# Drop every large object except merged, then trigger a garbage-collection pass
# NOTE(review): the 'embeddings' column was built with list(emb_matrix); if those
# rows are views of the matrix, its buffer stays alive after this del - confirm
import gc

del questions_clear, answers_clear, best_answers_clear, emb_matrix

gc.collect()

239

In [20]:
import chromadb
from chromadb.config import Settings
import uuid
from tqdm.notebook import tqdm

# Initialise Chroma with on-disk storage.
# chromadb.Client(Settings(persist_directory=...)) alone does not switch on
# persistence in chromadb >= 0.4 (the client stays in-memory), so the dedicated
# PersistentClient is used to actually store the data under ./chroma_data.
chroma_client = chromadb.PersistentClient(path="./chroma_data")

# Open the collection, creating it on first use, so that re-running this cell
# does not fail on an already existing "Alpha" collection
collection = chroma_client.get_or_create_collection(
    name="Alpha",
    metadata={"hnsw:space": "cosine"}  # use cosine distance
)

batch_size = 1000  # rows sent to Chroma per add() call
metadata_columns = ['Id', 'ParentId', 'Title', 'Score_question', 'CreationDate_question', 'Score_answer']

# The ids are random UUIDs, so adding into a non-empty collection would store
# every document twice; populate only an empty collection and refuse to
# continue on a partially filled one.
stored_count = collection.count()
if stored_count not in (0, len(merged)):
    raise RuntimeError(
        f"Collection 'Alpha' holds {stored_count} documents but {len(merged)} were expected; "
        "delete the collection and re-run this cell"
    )

if stored_count == 0:
    for i in tqdm(range(0, len(merged), batch_size), desc="Adding documents to Chroma"):
        batch = merged.iloc[i:i + batch_size]
        # One freshly generated unique identifier per row
        ids = [str(uuid.uuid4()) for _ in range(len(batch))]

        collection.add(
            documents=batch['Clean_text'].tolist(),
            metadatas=batch[metadata_columns].to_dict(orient='records'),
            embeddings=batch['embeddings'].tolist(),
            ids=ids
        )

# Check the number of documents in the collection
print(f"Всего документов в коллекции: {collection.count()}")

Adding documents to Chroma:   0%|          | 0/1103 [00:00<?, ?it/s]

Всего документов в коллекции: 1102568


In [16]:
# Удаляем коллекцию
# chroma_client.delete_collection("Alpha")

In [22]:
# Delete merged and run a garbage-collection pass
# (gc is imported in the earlier clean-up cell)
del merged

gc.collect()

0

## Реализация семантического поиска

In [23]:
# Очистка текста
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')  # Download the stop-word corpus
nltk.download('wordnet')  # Download WordNet
nltk.download('averaged_perceptron_tagger_eng')  # Download the POS tagger data

[nltk_data] Downloading package stopwords to C:\Users\VORANDPAV BIG
[nltk_data]     SPB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\VORANDPAV BIG
[nltk_data]     SPB\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\VORANDPAV BIG
[nltk_data]     SPB\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [24]:
stop_words = set(stopwords.words('english'))  # English stop-word set
contractions = ['ive', 'youre', 'theyre', 'hes', 'shes', 'its', 'weve', 'theyve', 'im', 'isnt', 'wasnt', 'werent',
                'hasnt', 'havent', 'dont', 'doesnt', 'didnt', 'cant', 'couldnt', 'shouldnt', 'mightnt', 'mustnt',
                'wouldnt']  # Contractions written without the apostrophe; clean_text filters them out
lemmatizer = WordNetLemmatizer()  # Lemmatizer instance


# Determine the part of speech of a word for lemmatization
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with nltk.pos_tag and only the first letter
    of the resulting tag is inspected: J, N, V and R map to ADJ, NOUN, VERB
    and ADV respectively; any other tag falls back to NOUN.
    """
    tagged_word = nltk.pos_tag([word])[0]
    tag_initial = tagged_word[1][0].upper()

    if tag_initial == 'J':
        return wordnet.ADJ
    if tag_initial == 'V':
        return wordnet.VERB
    if tag_initial == 'R':
        return wordnet.ADV
    # 'N' and every unrecognised tag are treated as nouns
    return wordnet.NOUN


# Text cleaning function
def clean_text(text):
    """Normalise raw post text into a space-separated string of lemmas.

    Non-string input yields an empty string. Otherwise code blocks, inline
    code, HTML tags, Markdown links and double-quoted fragments are removed,
    whitespace is collapsed, the text is lower-cased and reduced to letters,
    digits and whitespace, and every remaining word that is neither a stop
    word nor a listed contraction is lemmatized.
    """
    if not isinstance(text, str):
        return ''

    # Removal steps, applied strictly in this order
    removal_steps = (
        (r'<code>.*?</code>', re.DOTALL),  # <code> blocks, possibly multi-line
        (r'\`[^\`]*\`', 0),  # inline Markdown code
        (r'<[^>]*>', 0),  # remaining HTML tags
        (r'\[.*?\]\(.*?\)', 0),  # Markdown links
        (r'"[^"]*"', 0),  # text inside double quotes
    )
    for pattern, flags in removal_steps:
        text = re.sub(pattern, '', text, flags=flags)

    # Collapse whitespace runs, then lower-case and keep only a-z, 0-9 and whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[^a-z0-9\s]', '', text.lower())

    lemmas = []
    for word in text.split():
        if word in stop_words or word in contractions:
            continue  # stop words and contractions are dropped
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

    return ' '.join(lemmas)

In [25]:
# Load the model
from sentence_transformers import SentenceTransformer

# NOTE(review): the model is re-created here without the .half() call used when the stored embeddings were generated - confirm the precision difference is acceptable
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)


def semantic_search(query, top_k=5):
    """
    Run a semantic search against the Chroma collection.

    The query is passed through clean_text, embedded with the module-level
    model and the nearest neighbours are requested from the module-level
    collection.

    :param query: raw query text
    :param top_k: number of nearest neighbours to return, must be at least 1
    :return: the result dict returned by collection.query
    :raises ValueError: if top_k is smaller than 1
    """
    # Fail early with a clear message instead of an error raised inside Chroma
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    # Clean the query text the same way the stored documents were cleaned
    cleaned_query = clean_text(query)

    # Compute the query embedding. No device is forced, so encode() runs on the
    # device the model already uses and the search also works without CUDA;
    # data-loading workers are not needed for a single sentence.
    query_embedding = model.encode(
        [cleaned_query],
        show_progress_bar=False,
        convert_to_numpy=True,
    )[0]

    # Nearest-neighbour lookup; the vector is passed as a plain list of floats
    return collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=top_k
    )

## Тестирование семантического поиска

In [26]:
# Search example
# The best result matched the one from lab 4 - correct answer
# NOTE(review): the doubled quotes ("") look CSV-escaped; Python reads them as adjacent string literals, so the href values reach the query without quotes
query = "<p>I've written a database generation script in <a href=""http://en.wikipedia.org/wiki/SQL"">SQL</a> and want to execute it in my <a href=""http://en.wikipedia.org/wiki/Adobe_Integrated_Runtime"">Adobe AIR</a> application:</p>"
top_results = semantic_search(query, top_k=5)
top_results

{'ids': [['236dcf46-0fde-43d3-9559-c44b199c111a',
   '04c1c460-9482-4a10-8e5b-a6446039ee5a',
   'b6d8bef2-2b52-4a3d-b5ea-7d6b58e2c925',
   '08037cac-4522-48ea-a264-372217f11ccf',
   'ce303ff3-cb9a-47ef-bdad-086bf8a24ff5']],
 'embeddings': None,
 'documents': [['write database generation script sql want execute adobe air application execute adobe air use follow method error generate however exists seem look first query semicolon remove query fails way call multiple query one statement wound use kind hack actually work pretty well thing careful semicolon',
   'whenever try create one table actionscript first one get run use string query embed external file sql code every time seem ignore everything first perhaps feature actionscript deal database add table separate query way get around like idea whole db create statement set one contain place seem multiple statement arent support air sqlite command might separate script statement execute sequentially',
   'would like create database tabl

In [58]:
# Search example
# A similar question, but not the same one
# Result: the expected document is ranked 2nd
# NOTE(review): the doubled quotes ("") look CSV-escaped; Python reads them as adjacent string literals, so the href values reach the query without quotes
query = "<p>I've made a database maker script in <a href=""http://en.wikipedia.org/wiki/SQL"">SQL</a> and want to use it in my <a href=""http://en.wikipedia.org/wiki/Adobe_Integrated_Runtime"">Adobe AIR</a> application:</p>"
top_results = semantic_search(query, top_k=5)
top_results

{'ids': [['04c1c460-9482-4a10-8e5b-a6446039ee5a',
   '236dcf46-0fde-43d3-9559-c44b199c111a',
   'b6d8bef2-2b52-4a3d-b5ea-7d6b58e2c925',
   'ce303ff3-cb9a-47ef-bdad-086bf8a24ff5',
   '6faf4b3e-ecfb-4253-bc2f-8cfb73bc543f']],
 'embeddings': None,
 'documents': [['whenever try create one table actionscript first one get run use string query embed external file sql code every time seem ignore everything first perhaps feature actionscript deal database add table separate query way get around like idea whole db create statement set one contain place seem multiple statement arent support air sqlite command might separate script statement execute sequentially',
   'write database generation script sql want execute adobe air application execute adobe air use follow method error generate however exists seem look first query semicolon remove query fails way call multiple query one statement wound use kind hack actually work pretty well thing careful semicolon',
   'would like create database tabl

In [27]:
# Search example
# None of the results matched the one from lab 4 - wrong answer
query = "<p>I am porting the unity game in windows store game so have generated the windows store build from unity4.2.2 when i build the unity build solution from visual studio 2013 on windows 8.1 platform (retargetted the solution to 8.1 ) i am getting error at following line in AppManifest.xml</p>"
top_results = semantic_search(query, top_k=5)
top_results

{'ids': [['f553d986-9473-4093-babb-1ee25283e41f',
   'bac7bac5-e5c6-41b3-917d-be87f8a4fab1',
   'eb2ba2e5-9353-4e4f-9cc6-b65d56620ab5',
   'dd6f683a-804a-4a6c-990c-732610a16e6d',
   '6e1645fe-1582-4b86-8c25-daf638a29c0b']],
 'embeddings': None,
 'documents': [['use unity 531f1 vs2015 update 1 try export project window 10 universal xaml even try d3d get 300 error able run blank windows10 universal app create use vs2015 try reinstall update vs2015unity game work export window 881 platformwebgl window desktop edit go toolsoptionsnuget package manager tick allow nuget download miss package tick automatically check miss package build v',
   'visual studio 2013 solution include window 81 winjs app along mvc web application build solution locally visual studio work fine building build server fails follow error getsolutionconfigurationcontents could software versioning issue local environment instal build server idea would helpful track thanks problem packageappxmanifest file packaging tab set

In [28]:
# Search example
# The best result matched the one from lab 4 - correct answer
query = "I am building a test for an MVC5 controller method.  I'm using moq for the test.  What I'm interested in is how to test a controller method that requires authentication and uses the userid value not the username value to make decisions about what data to show to the browser/client."
top_results = semantic_search(query, top_k=5)
top_results

{'ids': [['6eed49f7-c4c5-44af-a3bb-2450d2ca0e03',
   '993a25e9-9f78-45a3-8d3a-5ed4daffad38',
   '21c77ef1-beb6-4d2f-b706-469bf8d5a658',
   '0b36f01d-e947-4944-9e20-c89977b78424',
   '5d2199a6-cc9e-4085-bdf4-7d7deeb68191']],
 'embeddings': None,
 'documents': [['building test mvc5 controller method use moq test interested test controller method require authentication us userid value username value make decision data show browserclient research far considerable amount code available moq username much code userid value case look like guid use aspnet identity account management owin add well google facebook login use dependency injection use unity seem way accomplish test plus di enables use moq example test method look notnull return controller method getcurrentuserid method query useridentity object userid use code method supposedly get replace moq getcurrentuserid method virtual method helper class myhelper parameter constructor model object modelobject parameter constructor controller 

In [57]:
# Search example
# A similar question, but not the same one
# Result: the expected document is ranked 5th
query = "I am making a experiment for an MVC5 controller case.  I'm using moq for the experiment.  What I'm fond of is how to experiment a control method that require"
top_results = semantic_search(query, top_k=5)
top_results

{'ids': [['415e6865-6f68-4b2a-bdf2-976b07daf20a',
   '5447d40c-b285-4b9a-8beb-8a3fc1320d60',
   'fbd4bb30-e6f4-48bc-b9eb-e4ae957cc5c5',
   'a3c365bb-79e4-4f0c-acb6-3ec1a47d4b36',
   '6eed49f7-c4c5-44af-a3bb-2450d2ca0e03']],
 'embeddings': None,
 'documents': [['here scenario work net mvc 40 project repository youd expect try implement moqxunit test library net unit test project get far mvc controller unit test class problem controller call directly everything work fine 100 policy load however run test 0 product load guess problem mock call somewhere line potentially service initialisation anyone ever offer advice also correct test service rather repository held data layer thanks advance test code initialize empty list tell mock service return empty list make test load policy need put policy instance list would write test look something like really depends aspect code try unit test see test simply confirms store policy object expect might write test logic depends policy instance',
   'a

In [56]:
# Search example
# The best result matched the one from lab 4 - correct answer
query = r"""<p>Would this be classified as an O(1) algorithm for "Hello, World!" ??</p>"""
top_results = semantic_search(query, top_k=5)
top_results

{'ids': [['9dc09d8e-8623-4fae-ac5e-541a60e0d83b',
   '04258003-aaa6-4a2a-a02e-03a9b890ab81',
   '681857a7-c8b3-4412-b933-f79589bb95eb',
   '3d7daa24-b5da-49e4-a071-24fe6d252747',
   '3af968e9-f41e-452c-a7f8-6a94c2ffba16']],
 'embeddings': None,
 'documents': [['would classify o1 algorithm think use snippet code busy loop put joke whenever someone asks algorithm certain complexity would correct big notation context use describe relationship size input function number operation need perform compute result input operation input output related use big notation nonsensical time operation take independent input operation isnone since relationship input number operation perform use big describe nonexistent relationship',
   'formulate algorithm problem analyse o2mn suppose get omn want know o2mn omn yes big ignores constant',
   'building intent recognition system use multiclass classfication svm currently number class limited training data however future may get data new class course put dat

In [50]:
# Search example
# A similar question, but not the same one
# Result: the expected document is ranked 1st
query = r"""Can it be categorized as an O(1) algorithm for "Hi, World!"""
top_results = semantic_search(query, top_k=5)
top_results

{'ids': [['3f9945e7-74db-409d-bc99-a519fca89a41',
   '7cd2edef-4f2d-4242-946a-359ccd0fd59f',
   '0f4109d3-e570-4ec5-8015-2cf1f69c1594',
   '9ccb80fe-f41e-4663-8a92-e473199c3c9e',
   '3ff2947e-2647-4180-9c14-c50146e3417a']],
 'embeddings': None,
 'documents': [['would classify o1 algorithm think use snippet code busy loop put joke whenever someone asks algorithm certain complexity would correct big notation context use describe relationship size input function number operation need perform compute result input operation input output related use big notation nonsensical time operation take independent input operation isnone since relationship input number operation perform use big describe nonexistent relationship',
   'selfstudy bigo understand give example follow notation algorithm on2 on3 come across notation quite comprehend give example term algorithm maybe phrase way write algorithm take run time proportion on34 log n3 olog2non 4n n32 fear misunderstand notation notation algorithm 

In [43]:
# Search example
# The best result matched the one from lab 4 - correct answer
query = r"""I cannot get iOS 6 to play any sound at all using the Web Audio API with examples that work fine in desktop Chrome."""
top_results = semantic_search(query, top_k=5)
top_results

{'ids': [['6e440527-e7f1-4adf-8d69-c034f391bc71',
   '02c774f0-3e91-4f92-a766-8f43e3c3c207',
   'db586ec7-4bc9-4dc3-b918-2547dd282752',
   '4acc7c9e-f4a0-4e53-b2ac-f19d29bb4542',
   '361cf918-0373-4cd6-a763-c19a1d887169']],
 'embeddings': None,
 'documents': [['really excite see io 6 support web audio api since make html5 game however cannot get io 6 play sound use web audio api example work fine desktop chrome html5 game touch control play audio via web audio api present fall back html5 audio httpwwwscirracomlabssbios6b edit srikumar suggest workarounds apply version still work httpwwwscirracomlabssbios6f everything play fine desktop chrome io 6 emits sound trouble debug window development io 6 replace debug mode remote web inspector apparently available safari window use alert find correctly identifies web audio api us detects vorbis support fall back aac audio decodes buffer play error thrown hear nothing course try turn volume max codec problem io 6 play aac fine browse one m4as ga