In [1]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import markdownify 
from IPython.display import display, Markdown
import re
import os
import pandas as pd
from annoy import AnnoyIndex
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
from tqdm._tqdm_notebook import tqdm_notebook


Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [2]:
class Document:
    def __init__(self, id, html):
        self.html = html
        self.soup = BeautifulSoup(self.html)
        self.title = self.process_title()
        try:
            self.markdown = self.process_markdown()
        except:
            print(id)
            raise
        self.clean_html = self.process_clean_html()
        self.id = self.lookup(int(id))
        self.original_id = str(id)
    
    def process_title(self):
        title = self.soup.find("title").string.replace(" | Support Vidio", "")
        return title

    def process_markdown(self):
        article = self.soup.find(id='article-body')
        tag_list = article.findAll(lambda tag: len(tag.attrs) > 0)
        for t in tag_list:
            deleted = []
            for attr in t.attrs:
                if attr not in ["src", "href"]:
                    deleted.append(attr)
            for attr in deleted:
                del t.attrs[attr]


        md = markdownify.markdownify(str(self.title) + str(article), heading_style="ATX")
        return md
    
    def process_clean_html(self):
        article = self.soup.find(id='article-body')
        tag_list = article.findAll(lambda tag: len(tag.attrs) > 0)
        for t in tag_list:
            deleted = []
            for attr in t.attrs:
                if attr not in ["src", "href"]:
                    deleted.append(attr)
            for attr in deleted:
                del t.attrs[attr]

        clean_html = '''
        <html>
            <head>
                <title>{0}</title>
            </head>
            <body>
                <h1>{1}</h1>
                {2}
            </body>
        </html>
        '''.format(str(self.title), str(self.title), str(article))

        return clean_html

    @staticmethod
    def to_df(documents):
        df = pd.DataFrame({
            "id": list(map(lambda x: x.id, documents)),
            "original_id": list(map(lambda x: x.original_id, documents)),
            "title": list(map(lambda x: x.title, documents)),
            "markdown": list(map(lambda x: x.markdown, documents)),
        })

        return df

    @staticmethod
    def lookup(id):
        return id % 1000000007

In [21]:
# Example

# filename = "43000621320-bagaimana-cara-bayar-vidio-dengan-kredivo-.html"
# d = None
# with open(f"./data/crawl/{filename}", "r") as f:
#     id = re.search("([0-9]+)", filename).group(1)
#     d = Document(id, f.read())
# d.clean_html

# with open(f'./data/output_clean.html', 'w+') as f:
#     f.write(d.clean_html)

In [3]:
directory = os.fsencode('./data/crawl/')
documents = []    
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(".html"):
        continue

    with open(f"./data/crawl/{filename}", "r") as f:
        id = re.search("([0-9]+)", filename).group(1)
        document = Document(id, f.read())
        documents.append(document)


In [27]:
# for document in documents:
#     with open(f'./data/clean_html/{document.original_id}.html', 'w+') as f:
#         f.write(document.clean_html)

In [46]:
# import importlib
# importlib.reload(aiohttp_retry)

import aiohttp
import aiohttp_retry
import asyncio
import json

async def fetch(session, content):
    MODEL_ID="7738653107357220864"
    PROJECT_ID="328583281153"
    # ACCESS_TOKEN=$(gcloud auth print-access-token)
    ACCESS_TOKEN="ya29.a0Ad52N3_RHPR30g_JFHNmORi-vpZM7qqOYJFwYJs4kuNygXob7AkcuACF0Flj9NOyGpAYmKmNvCylNlcy5w2bClirCPvaPBvoKIUx082Ve2SDFCSNmoJFef6LAB_iCV-QjWICw_AhQAUaa782ynj66ZB1pCLMdv8YwIISEMJs2QkaCgYKASgSARISFQHGX2Mi3Li-zaSpaqE0u4eMrcTFQA0178"
    URL=f"https://asia-southeast1-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/asia-southeast1/endpoints/{MODEL_ID}:predict"
    headers = {
        'Authorization': f'Bearer {ACCESS_TOKEN}'
    }
    payload = {
        'instances': [
            {
                'content': content,
                'task_type': 'DEFAULT',
                'title': ''
            }
        ]
    }
    async with session.post(URL,json=payload, headers=headers, retry_options=aiohttp_retry.ExponentialRetry(attempts=30,max_timeout=10, exceptions=[asyncio.exceptions.TimeoutError])) as response:
        res = await response.text()
        print('.',end='')

        return res

async def async_embed(contents):
    tasks = []
    # timeout = aiohttp.ClientTimeout(total=10)
    connector = aiohttp.TCPConnector(limit_per_host=2)
    # async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
    async with aiohttp.ClientSession(connector=connector) as session:
        retry_client = aiohttp_retry.RetryClient(session)
        for content in contents:
            tasks.append(fetch(retry_client, content))
        
        raw_res = []

        # try:
        raw_res = await asyncio.gather(*tasks)
        # except asyncio.exceptions.TimeoutError:
        #     print("timeout")
        #     print(raw_res)

        return list(map(lambda x: json.loads(x)['predictions'][0], raw_res))

async def embedding_texts_dataframe_async(df, column="search_text"):
    df.reset_index(drop=True, inplace=True)
    texts = df[column].values.tolist()
    embeds = await async_embed(texts)
    df['embedding'] = df.apply(lambda row : embeds[row.name], axis=1)
    

In [28]:
aiplatform.init(
    project='vidio-quiz-prod',
    location='asia-southeast1',
    staging_bucket='gs://genai_hackathon_2024',
)
model = aiplatform.Endpoint("7738653107357220864")
tqdm_notebook.pandas()

def embedding_text(model, text):
    prediction = model.predict(instances=[{
        "content": text,
        "task_type": "DEFAULT",
        "title": ""
    }])
    for embedding in prediction.predictions:
        vector = embedding
    return vector

def embedding_texts(model, texts):
    instances = [{"content": text, "task_type": "DEFAULT", "title": ""} for text in texts]
    prediction = model.predict(instances=instances)
    embeddings = [embedding for embedding in prediction.predictions]
    return list(map(lambda x: x.values, embeddings))

def embedding_texts_dataframe(df, column="search_text"):
    df.reset_index(drop=True, inplace=True)
    CHUNK_SIZE = 1
    embeds = []
    for i in tqdm_notebook(range(0,df.shape[0],CHUNK_SIZE)):
        end_idx = i+CHUNK_SIZE
        if end_idx > len(df.index):
            end_idx = len(df.index)
        curr = df[i:end_idx]
        texts = curr[column].values.tolist()
        embeds = embeds + embedding_texts(model, texts)
    print(embeds)
    df['embedding'] = df.apply(lambda row : embeds[row.name], axis=1)

def df_id_in(df, result):
    ids = result[0]
    distances = result[1]
    # df['order'] = df['id'].apply(lambda x: lst.index(x) if x in lst else -1)
    df['distance'] = df['id'].apply(lambda x: distances[ids.index(x)] if x in ids else -1)
    df = df[df['id'].isin(ids)]
    # return df.sort_values(['order']).drop(columns=['order'])
    return df.sort_values(['distance'], ascending=False)

        

In [6]:
df = Document.to_df(documents)
# df['embedding'] = df.progress_apply(lambda row: embedding_text(model, row['markdown']), axis=1)


In [47]:
await embedding_texts_dataframe_async(df, column="markdown")

..................................................................................................................................................................................................................................................................................

In [48]:
df.to_json('./data/faq.json', orient='records', lines=True)

In [32]:
annoy_index = AnnoyIndex(768, 'dot')
for index, row in df.iterrows():
    try:
        annoy_index.add_item(row['id'], row['embedding'])
    except:
        print(row['id'])
annoy_index.build(10)

True

In [33]:
# len(set(list(map(lambda x: x.id, documents)))) == len(set(list(map(lambda x: int(x.id) % 1000000007, documents))))

In [34]:
query =  "berapa harga 1 bulan"
query_vector = embedding_text(model, query)
results = annoy_index.get_nns_by_vector(query_vector, 10, search_k=-1, include_distances=True)
df_id_in(df, results)

Unnamed: 0,id,original_id,title,markdown,embedding,distance
270,714855,43000715156,Apakah pelanggan bisa mendapatkan promo lebih ...,Apakah pelanggan bisa mendapatkan promo lebih ...,"[-0.0217650775, 0.0114736678, 0.0216440652, 0....",0.833949
104,711984,43000712285,Apakah paket ini diperpanjang secara otomatis?,Apakah paket ini diperpanjang secara otomatis?...,"[-0.000632088806, 0.043326553, 0.0500438027, 0...",0.824979
207,717156,43000717457,Kapan periode promo berlangsung?,Kapan periode promo berlangsung?\nPeriode prom...,"[-0.0142186768, 0.0190491825, 0.0412636213, 0....",0.81307
159,711983,43000712284,Kapan paket ini mulai bisa dibeli ?,Kapan paket ini mulai bisa dibeli ?\nPaket ini...,"[0.00595653383, 0.0315996706, 0.0363692753, 0....",0.811088
107,713089,43000713390,Jika saya memiliki paket aktif dengan metode b...,Jika saya memiliki paket aktif dengan metode b...,"[0.0120332958, -0.000907549576, 0.022753777, 0...",0.808975
100,713088,43000713389,Jika saya berlangganan paket Platinum dan dipe...,Jika saya berlangganan paket Platinum dan dipe...,"[0.0115700942, 0.0321559, -0.00140187668, 0.02...",0.771587
187,713085,43000713386,Kapan harga paket Vidio mengalami kenaikan harga?,Kapan harga paket Vidio mengalami kenaikan har...,"[-0.00708315615, 0.0188244786, 0.0191431418, 0...",0.769741
13,660644,43000660945,Sampai kapan bonus ini bisa saya dapatkan?,Sampai kapan bonus ini bisa saya dapatkan?\nBo...,"[-0.00216950523, 0.013003896, 0.0389443077, 0....",0.765436
186,711219,43000711520,Berapa lama masa aktif paket HSS Berhadiah?,Berapa lama masa aktif paket HSS Berhadiah?\nP...,"[0.00856057275, 0.0349913202, 0.0567499585, 0....",0.764377
146,706241,43000706542,FAQ Perubahan Harga Diamond Package,FAQ Perubahan Harga Diamond Package\nQ : Kapan...,"[0.0308698397, 0.0182314888, 0.0051055993, 0.0...",0.764012
