## Crawl Help Page

In [1]:
import re
import requests

import lxml.html as lhtml

from tqdm.auto import tqdm

In [2]:
from collections import deque

prefixes = [
    '/bank/help/',
    '/invest/help/',
    '/insurance/help/',
    '/travel/help/',
    '/gorod/help/',
    '/help/sim-cards/',
]

hrefs_queue = deque()
hrefs_queue.extend(prefixes)

In [3]:
hrefs_visited = set()


def match_href(href, prefixes):
    return any(href.startswith(p) for p in prefixes)


def preprocess_href(href):
    href = href.strip()
    href = re.sub(r'^(https?://)?(www\.)?tinkoff\.ru', '', href)
    href = re.sub(r'/[?#](.*?)$', '/', href)
    if not href.startswith('/'):
        print(href)
        href = '/help/'
    return href


with tqdm() as pbar:
    while hrefs_queue:
        href = hrefs_queue.pop()
        if href in hrefs_visited:
            continue

        hrefs_visited.add(href)

        href = 'https://www.tinkoff.ru' + href
        page = requests.get(href)

        text = page.content.decode('utf8')
        tree = lhtml.fromstring(text)

        hrefs = tree.xpath('//a[contains(@href, "/help/")]/@href')
        hrefs = [preprocess_href(href) for href in hrefs]
        hrefs = [href for href in hrefs if match_href(href, prefixes=prefixes)]
        hrefs = sorted(set(hrefs))

        hrefs_queue.extend(hrefs)

        pbar.update(1)

0it [00:00, ?it/s]

https://notariat.ru/ru-ru/help/probate-cases/
https://www.gosuslugi.ru/help/faq/traffic_accident/102098
https://dolyame.ru/help/customer/about/


In [4]:
len(hrefs_visited), len(hrefs_queue)

(1414, 0)

In [5]:
class Node:
    def __init__(self, ch=None, is_terminal=False):
        self.ch = ch
        self.is_terminal = is_terminal
        self.children = {}

class Trie:
    def __init__(self):
        self.root = Node('')
    
    def add(self, word):
        node = self.root
        for w in word:
            if w not in node.children:
                node.children[w] = Node(w)
            node = node.children[w]
        node.is_terminal = True
        return node

In [6]:
trie = Trie()

for w in sorted(hrefs_visited):
    w = w.strip('/').split('/')
    trie.add(w)

In [7]:
def find_terminal(node, prefix=None, only_leaf=False):
    if prefix is None:
        prefix = []

    prefix.append(node.ch)
    if node.is_terminal:
        if only_leaf and len(node.children) > 0:
            pass
        else:
            yield list(prefix)
    
    for k, v in node.children.items():
        yield from find_terminal(v, prefix=prefix, only_leaf=only_leaf)
    
    prefix.pop()

In [8]:
len(list(find_terminal(trie.root, only_leaf=True)))

1109

In [9]:
hrefs = find_terminal(trie.root, only_leaf=True)
hrefs = ['/'.join(href) for href in hrefs]
print(*hrefs, sep='\n', file=open('href_all_help.txt', mode='w'))

## Download Crawled Pages

In [10]:
from pathlib import Path
from multiprocessing.dummy import Pool as ThreadPool
from functools import partial

In [11]:
DATA_PATH = Path("data")
DATA_PATH.mkdir(exist_ok=True)

In [12]:
hrefs = list(map(str.strip, open('href_all_help.txt')))

In [13]:
def download_page(path, workdir):
    href = "https://www.tinkoff.ru" + path
    page = requests.get(href)
    text = page.content.decode('utf8')

    path = Path(workdir + path)
    path.parent.mkdir(parents=True, exist_ok=True)
    print(text, file=path.open('w'))


download_page(hrefs[0], workdir=str(DATA_PATH))

In [14]:
with ThreadPool(processes=8) as pool:
    func_download_page = partial(download_page, workdir=str(DATA_PATH))
    pool.map(func_download_page, hrefs)

## Parse Pages

In [15]:
import orjson as json

In [16]:
def make_paragraph(p):
    return ' '.join(p.xpath('.//text()'))


def make_article(a):
    qs = ' '.join(a.xpath('.//h2//text()'))
    qs = re.sub(r'[ \xa0]+', ' ', qs).strip()
    
    ps = map(make_paragraph, a.xpath('.//p'))
    ps = '\n\n'.join(ps)
    ps = re.sub(r'\n\n+', '\n\n', ps)
    ps = re.sub(r'[ \xa0]+', ' ', ps)
    ps = ps.strip()
    
    return {'question': qs, 'answer': ps}


def parse_articles(tree):
    articles = tree.xpath('//div[@data-test="question-page"]//article')
    articles = [make_article(e) for e in articles]
    return articles


def process_page(path):
    with open(path) as fd:
        text = fd.read()
    tree = lhtml.fromstring(text)
    
    title = tree.xpath('//h1/text()')[0].strip()
    category = tree.xpath('//div[@role="navigation"]//span[@data-item-type="breadcrumbs"]//text()')
    articles = parse_articles(tree)
    
    return {
        'path': '/' + str(path).split('/', 1)[1],
        'title': title,
        'category': category,
        'articles': articles,
    }

In [17]:
with open('parsed_data.jsonl', mode='wb') as fd:
    for path in tqdm(sorted(DATA_PATH.glob('**/*'))):
        if not path.is_file():
            continue
        
        try:
            res = process_page(path)
        except Exception as e:
            print(f"Error `{e}` occured with: {path}")
            continue
        
        res = json.dumps(res)
        fd.write(res + b'\n')

  0%|          | 0/1520 [00:00<?, ?it/s]

Error `list index out of range` occured with: data/bank/help/debit-cards/junior/category/question-1
Error `list index out of range` occured with: data/bank/help/debit-cards/junior/category/question-2
Error `list index out of range` occured with: data/bank/help/debit-cards/junior/category/question-3
Error `list index out of range` occured with: data/bank/help/debit-cards/tinkoff-black/additional-options/additional-card
Error `list index out of range` occured with: data/bank/help/debit-cards/tinkoff-black/additional-options/cashback-for-charity
Error `list index out of range` occured with: data/bank/help/debit-cards/tinkoff-black/additional-options/installment
Error `list index out of range` occured with: data/bank/help/debit-cards/tinkoff-black/additional-options/koobyshka
Error `list index out of range` occured with: data/bank/help/debit-cards/tinkoff-black/additional-options/overdraft
Error `list index out of range` occured with: data/bank/help/debit-cards/tinkoff-black/additional-opt

In [18]:
!wc -l parsed_data.jsonl
!grep '"articles":\[\]' parsed_data.jsonl | wc -l

    1041 parsed_data.jsonl
      69


In [19]:
!ls -lsh parsed_data.jsonl

18440 -rw-r--r--  1 vladimir  staff   8,2M  3 авг 01:08 parsed_data.jsonl


### Convert To QA-format

In [20]:
with open('parsed_data.jsonl', mode='r') as fd, \
     open('parsed_data_conv.jsonl', mode='wb') as fb:
    for record_str in fd:
        record = json.loads(record_str)
        if not record["articles"]:
            continue
        
        messages = [e for e in record["articles"] if e["question"] and e["answer"]]
        for i, e in enumerate(messages, 1):
            message = {
                "path": record["path"],
                "qpos": i,
                "content": e["question"] + "\n\n" + e["answer"],
            }
            message_str = json.dumps(message)
            
            fb.write(message_str)
            fb.write(b'\n')

## Prepare Index

In [21]:
from dotenv import load_dotenv

load_dotenv()

True

In [22]:
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [23]:
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["source"] = record.get("path")
    metadata["seq_num"] = record.get("qpos")
    return metadata


loader = JSONLoader(
    file_path="parsed_data_conv.jsonl",
    jq_schema=".",
    content_key="content",
    metadata_func=metadata_func,
    json_lines=True,
)

documents = loader.load()
len(documents)

5663

In [24]:
splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

chunks = splitter.split_documents(documents)
len(chunks)

7865

In [25]:
embeddings = OpenAIEmbeddings()

documents_idx = FAISS.from_documents(documents=chunks, embedding=embeddings)
documents_idx.save_local("parsed_data_conv.idx")

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-pvyCwgUnULtNqPkr8NteKGLT on tokens per min. Limit: 1000000 / min. Current: 716014 / min. Contact us through our help center at help.openai.com if you continue to have issues..
