<a href="https://colab.research.google.com/github/tatianamgar/Work_GPN/blob/main/Repeating%20queries%20analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Импорты**

In [None]:
!pip install transformers


In [None]:
!pip install ruamel.yaml


In [None]:
!pip install tensorflow_text

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

import os
import sys
from pathlib import Path

root_dir = os.path.abspath(os.getcwd())
root_dir = str(Path(root_dir).parent)
sys.path.append(root_dir)

import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import torch
from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast
import tensorflow_hub as hub
import tensorflow_text
import tensorflow as tf
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopwords = stopwords.words("russian")

from src.utilits import read_config




Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Функции**

In [None]:
# Parameters file, given as a relative path.
config_path = "params.yaml"
# Parsed configuration; Embed reads config["train_parameters"]["bs"] and several
# config["files"] entries from it.
config = read_config.read(config_path)

In [None]:
class Embed:
    """Sentence embeddings taken from the encoder of a saved DistilBERT token classifier.

    The classification head is loaded with the model but is not used: ``get``
    calls only ``self.model.distilbert`` and pools its token vectors.
    """

    def __init__(self, config):
        """Load the tag list, the model and the tokenizer named in ``config``.

        Args:
            config: Mapping providing ``config["train_parameters"]["bs"]`` (batch
                size) and the ``config["files"]`` entries ``tag_file``,
                ``path_model_save`` and ``tokenizer_model``.
        """
        self.config = config
        self.batch_size = config["train_parameters"]["bs"]
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.labels = self.__load_tags(config["files"]["tag_file"])
        # Duplicates are kept on purpose: the count is passed as num_labels below.
        num_tags = len(self.labels)

        self.model = DistilBertForTokenClassification.from_pretrained(
            config["files"]["path_model_save"],
            num_labels=num_tags,
            output_attentions=False,
            output_hidden_states=False,
        )
        self.model = self.model.to(self.device)
        self.model.eval()

        self.tokenizer = DistilBertTokenizerFast.from_pretrained(
            config["files"]["tokenizer_model"], do_lower_case=False,
        )

    def get(self, data: list):
        """Embed every text as the attention-masked mean of its token vectors.

        Padding positions are excluded from the mean, so the vector of a text
        does not depend on the length of the other texts in its batch.

        Args:
            data: Non-empty list of strings.

        Returns:
            ``numpy.ndarray`` with one row per input text, in input order.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError("Embed.get() needs at least one text to embed")

        embeddings = []
        for start in range(0, len(data), self.batch_size):
            batch = data[start:start + self.batch_size]
            encodings = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            input_ids = encodings.input_ids.to(self.device)
            attention_mask = encodings.attention_mask.to(self.device)
            with torch.no_grad():
                # First output is taken to be (batch, tokens, hidden), as the
                # pooling over axis 1 requires.
                hidden = self.model.distilbert(input_ids, attention_mask=attention_mask)[0]
                mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
                # clamp guards against division by zero for an all-padding row.
                token_counts = mask.sum(dim=1).clamp(min=1)
                pooled = (hidden * mask).sum(dim=1) / token_counts
            embeddings.append(pooled.cpu().numpy())
        return np.concatenate(embeddings, axis=0)


    @staticmethod
    def __load_tags(tags_path):
        """Read the pickled tag list and remove the ``B-`` / ``I-`` markers from each tag."""
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted tag files.
        with open(tags_path, "rb") as f:
            tags = pickle.load(f)
        return [tag.replace("B-", "").replace("I-", "") for tag in tags]

In [None]:
# One shared embedder for the whole notebook, built once from the loaded config.
embed = Embed(config)


def vectorize_texts_bert(texts):
    """Return whatever the shared ``embed`` instance produces for ``texts``."""
    return embed.get(texts)

In [None]:
# Load the USE model (multilingual Universal Sentence Encoder) from TF Hub.
nlp = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")


def vectorize_texts_use(texts):
    """Pass ``texts`` to the loaded USE model and return its output unchanged."""
    return nlp(texts)

In [None]:
def preprocess_text(text):
    """Normalize a raw query string before vectorization.

    Steps: remove ``<...>`` tags, lowercase, replace every character that is
    neither a word character nor whitespace with a space, collapse whitespace
    runs to a single space and trim both ends.

    Args:
        text: Query string.

    Returns:
        The normalized string; may be empty.
    """
    # Non-greedy match so each tag is removed on its own.
    text = re.sub(r"<.*?>", "", text)
    text = text.lower()

    # Punctuation becomes a space so neighbouring words are not glued together.
    text = re.sub(r"[^\w\s]", " ", text)
    # Collapse whitespace and drop the leading/trailing space that removed
    # punctuation would otherwise leave behind.
    return re.sub(r"\s+", " ", text).strip()

In [None]:
def repeated_queries_search(sessions, model = 'bert'):
    """Add, for every session, the similarity of each query to the query before it.

    Each row of ``sessions`` holds the list of queries of one session in its
    ``'query'`` column. The queries are preprocessed, vectorized with the
    chosen model and compared by cosine similarity.

    Args:
        sessions: DataFrame with a ``'query'`` column of lists of strings.
        model: ``'bert'`` (default) or ``'use'``; selects the vectorizer.

    Returns:
        The same ``sessions`` object, with a ``'sims_to_prev'`` column added or
        overwritten in place. Each cell is a list as long as the session's
        query list; the first entry is 0 because the first query has no
        predecessor.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    # Resolve the vectorizer up front so a bad name fails before any work is
    # done; only the selected function is looked up.
    if model == 'bert':
        vectorize = vectorize_texts_bert
    elif model == 'use':
        vectorize = vectorize_texts_use
    else:
        raise ValueError(f"Unknown model {model!r}; expected 'bert' or 'use'")

    sims_per_session = []
    for raw_queries in tqdm(sessions['query'], total=len(sessions)):
        queries = [preprocess_text(query) for query in raw_queries]
        if not queries:
            sims_per_session.append([])
            continue

        # Vectorize and compute pairwise similarity.
        vectors = vectorize(queries)
        similarity = cosine_similarity(vectors, vectors)

        # Entry [i, i - 1] compares query i with the previous one, i.e. the
        # first sub-diagonal of the matrix.
        to_previous = np.diagonal(similarity, offset=-1).tolist()
        sims_per_session.append([0.0, *to_previous])

    # Assign the whole column at once: chained per-cell assignment does not
    # reliably write through to the frame.
    sessions['sims_to_prev'] = pd.Series(sims_per_session, index=sessions.index, dtype=object)

    return sessions




In [None]:
def create_df_with_sim(sessions):
    """Flatten per-session lists into one row per query.

    Args:
        sessions: DataFrame whose rows carry ``'sims_to_prev'``, ``'level_0'``
            and ``'query'`` as equally long lists, plus a scalar
            ``'session_id'``.

    Returns:
        DataFrame with a fresh 0..n-1 index and the columns ``'level_0'``
        (position of the query inside its session), ``'similarity with
        previous query'``, ``'index'`` (the values taken from the input
        ``'level_0'`` lists), ``'query'`` and ``'session_id'``.
    """
    columns = ['similarity with previous query', 'index', 'query', 'session_id']

    per_session = [
        pd.DataFrame({
            'similarity with previous query': sims,
            'index': row_ids,
            'query': queries,
            'session_id': session_id,
        })
        for sims, row_ids, queries, session_id in zip(
            sessions['sims_to_prev'], sessions['level_0'],
            sessions['query'], sessions['session_id'],
        )
    ]

    # Concatenate once: growing a frame inside the loop copies it every time
    # and mixes in an empty object-dtype frame.
    if per_session:
        result = pd.concat(per_session)
    else:
        result = pd.DataFrame(columns=columns)

    # An 'index' column already exists, so reset_index() stores the old
    # per-session positions under 'level_0'.
    return result.reset_index()



# **Обработка выгрузки**

**Предобработка**
убрать из выборки (здесь предобработки нет, т.к. я передаю уже обработанный датасет для работы в Колабе, в кспд соединю с другим скриптом обработки):
- запросы без айди сессий и запросов
- пустые запросы
- поиск по ID
- запросы, совершенные с помощью кнопки "похожие материалы" (в самом подборе уточняющих запросов будет учтено)

Подбор уточняющих запросов:
- сортировать запросы по времени в каждом айди сессии по возрастанию
- цикл векторизации и кос сходства на лист запросов в каждой сессии

In [None]:
# audit_search table, exported with '$' as the field separator.
search = pd.read_csv('/content/search_processed.csv', sep='$')

# Order the queries of every session chronologically. reset_index() keeps the
# pre-sort row labels as a column; the aggregation below reads them as
# 'level_0' (presumably because the CSV already has an 'index' column — confirm).
search_sorted = search.sort_values(by=['session_id', 'date_time'], ascending=True).reset_index()

# One row per session: its queries and their row labels, as parallel lists.
sessions = (
    search_sorted
    .groupby("session_id")
    .agg({"query": list, "level_0": list})
    .reset_index()
)
sessions.head(3)

> # **USE**

In [None]:
# repeated_queries_search writes 'sims_to_prev' into `sessions` and returns that
# same object, so `sessions_use` is an alias of `sessions`.
sessions_use = repeated_queries_search(sessions, model = 'use')
result = create_df_with_sim(sessions_use)
# Join on the explicit row id: result['index'] holds each query's 'level_0'
# value. Joining 'level_0' against the positional index of `result` is only
# correct when the CSV rows were already in sorted order.
search_sims = pd.merge(search_sorted, result, left_on='level_0', right_on='index')
# Columns present in both frames get the _x (left) / _y (right) suffixes.
search_sims = search_sims[['date_time', 'page_number', 'query_x', 'username', 'total_results', 'activeLayer', 'session_id_x', 'similarity with previous query']]
search_sims.head(3)

100%|██████████| 66/66 [00:07<00:00,  8.79it/s]


Unnamed: 0,date_time,page_number,query_x,username,total_results,activeLayer,session_id_x,similarity with previous query
0,2023-06-06 07:30:53.566,1,Лента полиэтиленовая для изоляции нефтегазопро...,Stepina.VV@GAZPROM-NEFT.LOCAL,6,AI,03752a88-da02-4556-be69-4946bc503b43,0.0
1,2023-06-06 03:30:29.004,1,фара ручная,Rubtsov.II@GAZPROM-NEFT.LOCAL,24,AI,0a2c9488-e614-44f8-9307-4e06c26df669,0.0
2,2023-06-06 03:32:28.052,1,фара ручная,Rubtsov.II@GAZPROM-NEFT.LOCAL,24,AI,0a2c9488-e614-44f8-9307-4e06c26df669,1.0


In [None]:
# Save the USE-based similarity table to an Excel file (relative path).
search_sims.to_excel('sim_by_use.xlsx')

> # **BERT**

In [None]:
# Default model is 'bert'. repeated_queries_search writes 'sims_to_prev' into
# `sessions` and returns that same object, so `sessions_bert` is an alias of
# `sessions` and any earlier 'sims_to_prev' values are overwritten.
sessions_bert = repeated_queries_search(sessions)
result = create_df_with_sim(sessions_bert)
# Join on the explicit row id: result['index'] holds each query's 'level_0'
# value. Joining 'level_0' against the positional index of `result` is only
# correct when the CSV rows were already in sorted order.
search_sims = pd.merge(search_sorted, result, left_on='level_0', right_on='index')
# Columns present in both frames get the _x (left) / _y (right) suffixes.
search_sims = search_sims[['date_time', 'page_number', 'query_x', 'username', 'total_results', 'activeLayer', 'session_id_x', 'similarity with previous query']]
search_sims.head(3)

100%|██████████| 66/66 [00:11<00:00,  5.76it/s]


Unnamed: 0,date_time,page_number,query_x,username,total_results,activeLayer,session_id_x,similarity with previous query
0,2023-06-06 07:30:53.566,1,Лента полиэтиленовая для изоляции нефтегазопро...,Stepina.VV@GAZPROM-NEFT.LOCAL,6,AI,03752a88-da02-4556-be69-4946bc503b43,0.0
1,2023-06-06 03:30:29.004,1,фара ручная,Rubtsov.II@GAZPROM-NEFT.LOCAL,24,AI,0a2c9488-e614-44f8-9307-4e06c26df669,0.0
2,2023-06-06 03:32:28.052,1,фара ручная,Rubtsov.II@GAZPROM-NEFT.LOCAL,24,AI,0a2c9488-e614-44f8-9307-4e06c26df669,1.0


**Примечание:** первому запросу в сессии проставляется сходство 0, так как у него нет предыдущего запроса.

In [None]:
# Save the BERT-based similarity table to an Excel file (relative path).
search_sims.to_excel('sim_by_bert.xlsx')