In [232]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import markdownify 
from IPython.display import display, Markdown
import re
import os
import pandas as pd

In [237]:
class Document:
    def __init__(self, id, html):
        self.html = html
        self.soup = BeautifulSoup(self.html)
        self.title = self.process_title()
        try:
            self.markdown = self.process_markdown()
        except:
            print(id)
            raise
        self.clean_html = self.process_clean_html()
        self.id = self.lookup(int(id))
        self.original_id = str(id)
    
    def process_title(self):
        title = self.soup.find("title").string.replace(" | Support Vidio", "")
        return title

    def process_markdown(self):
        article = self.soup.find(id='article-body')
        tag_list = article.findAll(lambda tag: len(tag.attrs) > 0)
        for t in tag_list:
            deleted = []
            for attr in t.attrs:
                if attr not in ["src", "href"]:
                    deleted.append(attr)
            for attr in deleted:
                del t.attrs[attr]


        md = markdownify.markdownify(str(self.title) + str(article), heading_style="ATX")
        return md
    
    def process_clean_html(self):
        article = self.soup.find(id='article-body')
        tag_list = article.findAll(lambda tag: len(tag.attrs) > 0)
        for t in tag_list:
            deleted = []
            for attr in t.attrs:
                if attr not in ["src", "href"]:
                    deleted.append(attr)
            for attr in deleted:
                del t.attrs[attr]

        clean_html = '''
        <html>
            <head>
                <title>{0}</title>
            </head>
            <body>
                <h1>{1}</h1>
                {2}
            </body>
        </html>
        '''.format(str(self.title), str(self.title), str(article))

        return clean_html

    @staticmethod
    def to_df(documents):
        df = pd.DataFrame({
            "id": list(map(lambda x: x.id, documents)),
            "original_id": list(map(lambda x: x.original_id, documents)),
            "title": list(map(lambda x: x.title, documents)),
            "markdown": list(map(lambda x: x.markdown, documents)),
        })

        return df

    @staticmethod
    def lookup(id):
        return id % 1000000007

In [231]:
filename = "43000621320-bagaimana-cara-bayar-vidio-dengan-kredivo-.html"
d = None
with open(f"./data/crawl/{filename}", "r") as f:
    id = re.search("([0-9]+)", filename).group(1)
    d = Document(id, f.read())
d.clean_html

with open(f'./data/output_clean.html', 'w+') as f:
    f.write(d.clean_html)

In [239]:
directory = os.fsencode('./data/crawl/')
documents = []    
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(".html"):
        continue

    with open(f"./data/crawl/{filename}", "r") as f:
        id = re.search("([0-9]+)", filename).group(1)
        document = Document(id, f.read())
        documents.append(document)


In [242]:
for document in documents:
    with open(f'./data/clean_html/{document.original_id}.html', 'w+') as f:
        f.write(document.clean_html)

In [337]:
import aiohttp
import asyncio
import json

async def fetch(session, content):
    MODEL_ID="textembedding-gecko-multilingual"
    PROJECT_ID="328583281153"
    # ACCESS_TOKEN=$(gcloud auth print-access-token)
    ACCESS_TOKEN="ya29.a0Ad52N3_PlpD4BV0l4YTjbcRYrAmO9kSLo1qe5XojE3ZZoxC2eHj6DzKiBt3CDaOHsM_Hvumxnqn0Myhbv8j2-VKEXIs0Yh8-Yqt9BAIDQClvSREUKMkrtgRalc3rQ4DYkZTKtRiNRxg2MhpCrdan84jMXJFkNVYWjPtf3nGF3qUaCgYKAeUSARISFQHGX2MiIeUEi39GZ3aa4CA0mQECIA0178"
    URL=f"https://asia-southeast1-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/us-central1/publishers/google/models/{MODEL_ID}:predict"
    headers = {
        'Authorization': f'Bearer {ACCESS_TOKEN}'
    }
    payload = {
        'instances': [
            {
                'content': content
            }
        ]
    }
    async with session.post(URL,json=payload, headers=headers) as response:
        return await response.text()

async def async_embed(contents):
    tasks = []
    async with aiohttp.ClientSession() as session:
        for content in contents:
            tasks.append(fetch(session, content))
        raw_res = await asyncio.gather(*tasks)
        return list(map(lambda x: json.loads(x)['predictions'][0]['embeddings']['values'], raw_res))



In [344]:

from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
from tqdm._tqdm_notebook import tqdm_notebook

aiplatform.init(
    project='vidio-quiz-prod',
    location='asia-southeast1',
    staging_bucket='gs://genai_hackathon_2024',
)
tqdm_notebook.pandas()

model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual")

def embedding_text(model, text):
    embeddings = model.get_embeddings([text])
    for embedding in embeddings:
        vector = embedding.values
        # print(f"Length of Embedding Vector: {len(vector)}")
    return vector

def embedding_texts(model, texts):
    # inputs = list(map(lambda x: TextEmbeddingInput(text=x), texts))
    # embeddings = model.get_embeddings(inputs)
    embeddings = model.get_embeddings(texts)
    return list(map(lambda x: x.values, embeddings))

async def embedding_texts_dataframe_async(df, column="search_text"):
    df.reset_index(drop=True, inplace=True)
    texts = df[column].values.tolist()
    embeds = await async_embed(texts)
    df['embedding'] = df.apply(lambda row : embeds[row.name], axis=1)

# error = []
def embedding_texts_dataframe(df, column="search_text"):
    df.reset_index(drop=True, inplace=True)
    CHUNK_SIZE = 2
    embeds = []
    for i in tqdm_notebook(range(0,df.shape[0],CHUNK_SIZE)):
        end_idx = i+CHUNK_SIZE
        if end_idx > len(df.index):
            end_idx = len(df.index)
        curr = df[i:end_idx]
        texts = curr[column].values.tolist()
        embeds = embeds + embedding_texts(model, texts)
        # try:
        #     embeds = embeds + embedding_texts(model, texts)
        # except:
        #     print(i, ":", end_idx)
        #     error = texts
        #     raise

    df['embedding'] = df.apply(lambda row : embeds[row.name], axis=1)

def df_id_in(df, result):
    ids = result[0]
    distances = result[1]
    # df['order'] = df['id'].apply(lambda x: lst.index(x) if x in lst else -1)
    df['distance'] = df['id'].apply(lambda x: distances[ids.index(x)] if x in ids else -1)
    df = df[df['id'].isin(ids)]
    # return df.sort_values(['order']).drop(columns=['order'])
    return df.sort_values(['distance'], ascending=False)

        

In [289]:
df = Document.to_df(documents)
embedding_texts_dataframe(df, column="markdown")
df.to_json('./data/faq_embed.json', orient='records', lines=True)

  0%|          | 0/137 [00:00<?, ?it/s]

In [339]:
df2 = df.copy()

In [345]:
await embedding_texts_dataframe_async(df2, column="markdown")

In [346]:
df2

Unnamed: 0,id,original_id,title,markdown,embedding,distance
0,714399,43000714700,Saya sudah punya paket langganan Vidio Diamond...,Saya sudah punya paket langganan Vidio Diamond...,"[0.03136231750249863, 0.027689173817634583, 0....",-1.0
1,714401,43000714702,Tipe STB apa saja yang dapat mengakses layanan...,Tipe STB apa saja yang dapat mengakses layanan...,"[0.0014358796179294586, 0.0077969469130039215,...",-1.0
2,714853,43000715154,Sampai kapan periode promo First Media?,Sampai kapan periode promo First Media?\nPromo...,"[0.0008621924789622426, 0.0022952023427933455,...",-1.0
3,60022,43000060323,Bagaimana cara mengunduh tayangan di Vidio?,Bagaimana cara mengunduh tayangan di Vidio?\nV...,"[-0.0006356259109452367, -0.008756340481340885...",-1.0
4,712923,43000713224,Syarat dan Ketentuan Kuis Cinta Setelah Cinta,Syarat dan Ketentuan Kuis Cinta Setelah Cinta\...,"[0.009626702405512333, -0.032651010900735855, ...",-1.0
...,...,...,...,...,...,...
269,720959,43000721260,Saya sudah membeli paket PL Mobile via Shopee ...,Saya sudah membeli paket PL Mobile via Shopee ...,"[0.014896291308104992, -0.005151101853698492, ...",-1.0
270,717329,43000717630,Apakah saya akan dikenakan kuota jika menyaksi...,Apakah saya akan dikenakan kuota jika menyaksi...,"[-0.010666944086551666, -0.01846224069595337, ...",-1.0
271,656674,43000656975,Bagaimana memecahkan kendala audio pada perang...,Bagaimana memecahkan kendala audio pada perang...,"[-0.014890373684465885, -0.013640950433909893,...",-1.0
272,59908,43000060209,Bagaimana cara menghubungi Support Vidio?,Bagaimana cara menghubungi Support Vidio?\nCar...,"[-0.00494557898491621, -0.03522136062383652, 0...",-1.0


In [347]:
df

Unnamed: 0,id,original_id,title,markdown,embedding,distance
0,714399,43000714700,Saya sudah punya paket langganan Vidio Diamond...,Saya sudah punya paket langganan Vidio Diamond...,"[0.03129935637116432, 0.02767733484506607, 0.0...",-1.0
1,714401,43000714702,Tipe STB apa saja yang dapat mengakses layanan...,Tipe STB apa saja yang dapat mengakses layanan...,"[0.0014019496738910675, 0.007970788516104221, ...",-1.0
2,714853,43000715154,Sampai kapan periode promo First Media?,Sampai kapan periode promo First Media?\nPromo...,"[0.0008715722942724824, 0.002215511631220579, ...",-1.0
3,60022,43000060323,Bagaimana cara mengunduh tayangan di Vidio?,Bagaimana cara mengunduh tayangan di Vidio?\nV...,"[-0.0009788272436708212, -0.009444614872336388...",-1.0
4,712923,43000713224,Syarat dan Ketentuan Kuis Cinta Setelah Cinta,Syarat dan Ketentuan Kuis Cinta Setelah Cinta\...,"[0.009629127569496632, -0.032651305198669434, ...",-1.0
...,...,...,...,...,...,...
269,720959,43000721260,Saya sudah membeli paket PL Mobile via Shopee ...,Saya sudah membeli paket PL Mobile via Shopee ...,"[0.014955655671656132, -0.005131922662258148, ...",-1.0
270,717329,43000717630,Apakah saya akan dikenakan kuota jika menyaksi...,Apakah saya akan dikenakan kuota jika menyaksi...,"[-0.01062898151576519, -0.018987072631716728, ...",-1.0
271,656674,43000656975,Bagaimana memecahkan kendala audio pada perang...,Bagaimana memecahkan kendala audio pada perang...,"[-0.014711934141814709, -0.01374794915318489, ...",-1.0
272,59908,43000060209,Bagaimana cara menghubungi Support Vidio?,Bagaimana cara menghubungi Support Vidio?\nCar...,"[-0.005063499324023724, -0.035174500197172165,...",-1.0


In [None]:
# error = df[75:80]
# error = df[75:77]
error = df[77:80]
# error = df[75:76]
values = error['markdown'].values.tolist()
# values
e = embedding_texts(model, values)
e

In [292]:

df

Unnamed: 0,id,original_id,title,markdown,embedding
0,714399,43000714700,Saya sudah punya paket langganan Vidio Diamond...,Saya sudah punya paket langganan Vidio Diamond...,"[0.03129935637116432, 0.02767733484506607, 0.0..."
1,714401,43000714702,Tipe STB apa saja yang dapat mengakses layanan...,Tipe STB apa saja yang dapat mengakses layanan...,"[0.0014019496738910675, 0.007970788516104221, ..."
2,714853,43000715154,Sampai kapan periode promo First Media?,Sampai kapan periode promo First Media?\nPromo...,"[0.0008715722942724824, 0.002215511631220579, ..."
3,60022,43000060323,Bagaimana cara mengunduh tayangan di Vidio?,Bagaimana cara mengunduh tayangan di Vidio?\nV...,"[-0.0009788272436708212, -0.009444614872336388..."
4,712923,43000713224,Syarat dan Ketentuan Kuis Cinta Setelah Cinta,Syarat dan Ketentuan Kuis Cinta Setelah Cinta\...,"[0.009629127569496632, -0.032651305198669434, ..."
...,...,...,...,...,...
269,720959,43000721260,Saya sudah membeli paket PL Mobile via Shopee ...,Saya sudah membeli paket PL Mobile via Shopee ...,"[0.014955655671656132, -0.005131922662258148, ..."
270,717329,43000717630,Apakah saya akan dikenakan kuota jika menyaksi...,Apakah saya akan dikenakan kuota jika menyaksi...,"[-0.01062898151576519, -0.018987072631716728, ..."
271,656674,43000656975,Bagaimana memecahkan kendala audio pada perang...,Bagaimana memecahkan kendala audio pada perang...,"[-0.014711934141814709, -0.01374794915318489, ..."
272,59908,43000060209,Bagaimana cara menghubungi Support Vidio?,Bagaimana cara menghubungi Support Vidio?\nCar...,"[-0.005063499324023724, -0.035174500197172165,..."


In [294]:
final_df = df
from annoy import AnnoyIndex
annoy_index = AnnoyIndex(768, 'dot')
for index, row in final_df.iterrows():
    try:
        annoy_index.add_item(int(row['id']) % 1000000007, row['embedding'])
    except:
        print(row['id'])
annoy_index.build(10)

True

In [200]:
int(43000664380)

43000664380

In [201]:
len(set(list(map(lambda x: x.id, documents))))

263

In [205]:

len(set(list(map(lambda x: int(x.id) % 1000000007, documents))))

263

In [295]:

query =  "berapa harga 1 bulan"
query_vector = embedding_text(model, query)
results = annoy_index.get_nns_by_vector(query_vector, 10, search_k=-1, include_distances=True)
df_id_in(df, results)

Unnamed: 0,id,original_id,title,markdown,embedding,distance
82,714855,43000715156,Apakah pelanggan bisa mendapatkan promo lebih ...,Apakah pelanggan bisa mendapatkan promo lebih ...,"[-0.02110324241220951, 0.01820087805390358, 0....",0.818332
49,711984,43000712285,Apakah paket ini diperpanjang secara otomatis?,Apakah paket ini diperpanjang secara otomatis?...,"[0.010040204040706158, 0.048084553331136703, 0...",0.812453
217,711983,43000712284,Kapan paket ini mulai bisa dibeli ?,Kapan paket ini mulai bisa dibeli ?\nPaket ini...,"[0.013964376412332058, 0.03291633725166321, 0....",0.789614
32,717156,43000717457,Kapan periode promo berlangsung?,Kapan periode promo berlangsung?\nPeriode prom...,"[-0.010691639967262745, 0.02130122110247612, 0...",0.789412
255,713089,43000713390,Jika saya memiliki paket aktif dengan metode b...,Jika saya memiliki paket aktif dengan metode b...,"[0.016423260793089867, 0.0029917797073721886, ...",0.783876
102,713088,43000713389,Jika saya berlangganan paket Platinum dan dipe...,Jika saya berlangganan paket Platinum dan dipe...,"[0.016638044267892838, 0.03356192260980606, 0....",0.749655
189,660644,43000660945,Sampai kapan bonus ini bisa saya dapatkan?,Sampai kapan bonus ini bisa saya dapatkan?\nBo...,"[-0.0007567215361632407, 0.01574697159230709, ...",0.74608
261,706241,43000706542,FAQ Perubahan Harga Diamond Package,FAQ Perubahan Harga Diamond Package\nQ : Kapan...,"[0.03095274232327938, 0.018538691103458405, 0....",0.744548
83,713086,43000713387,Berapa harga Platinum yang akan dibayarkan?,Berapa harga Platinum yang akan dibayarkan?\nH...,"[0.05155961588025093, -0.0009671734878793359, ...",0.744332
9,720953,43000721254,Apakah paket PL Mobile via Play Pass akan dipe...,Apakah paket PL Mobile via Play Pass akan dipe...,"[0.03137285262346268, 0.0291066262871027, 0.02...",0.744271
