In [1]:
import json
from groq import Groq
import os
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pickle
from requests.exceptions import HTTPError
import time
from concurrent.futures import ThreadPoolExecutor

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Groq API client shared by every request in this notebook; the key is taken
# from the GROQ_API_KEY environment variable (KeyError if it is not set).
client = Groq(api_key=os.environ['GROQ_API_KEY'])

In [4]:
# Load the source documents (each record carries at least 'id', 'question'
# and 'answer', which the cells below read).
# The encoding is given explicitly: the file holds Vietnamese text, and the
# platform default encoding is not guaranteed to be UTF-8.
with open('../data/vietnamese_rag/documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [31]:
# Prompt sent to the LLM for each document. It is filled in with
# `prompt_template.format(**doc)`, so `{question}` and `{answer}` are the only
# placeholders; the model is asked to reply with a bare JSON list of three
# Vietnamese questions.
prompt_template = """
You emulate my assistant who works with me in a Q and A project .
Formulate 3 questions people might ask based on the record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. Make sure the questions should be in Vietnamese and the output can be parsed into json format.

The record:

question: {question}
answer: {answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3"] (no need to rewrite question1, question2, question3, just replace them with your question and make sure to answer immediately without any unrelated things like: "Here are the questions and answers in JSON format:")
""".strip()

In [46]:
def _status_code(exc):
    """Return the HTTP status attached to `exc`, or None if it has none."""
    status = getattr(exc, 'status_code', None)
    if status is None:
        status = getattr(getattr(exc, 'response', None), 'status_code', None)
    return status


def _rate_limit_delay(exc, fallback):
    """Return the number of seconds to wait after a rate-limit (429) error.

    Tries, in order: a numeric `retry-after` response header, the
    "try again in <N>s" hint inside the error message, and finally `fallback`.
    """
    import re  # local import so this block does not depend on a new top-level import

    headers = getattr(getattr(exc, 'response', None), 'headers', None)
    if headers:
        try:
            return max(float(headers.get('retry-after')), 0.0)
        except (TypeError, ValueError):
            pass  # header missing or not a plain number of seconds

    # Matches hints such as "try again in 1.282s" or "try again in 1m26.4s".
    hint = re.search(r'try again in (?:(\d+)m)?(\d+(?:\.\d+)?)s', str(exc))
    if hint:
        return int(hint.group(1) or 0) * 60 + float(hint.group(2))
    return fallback


def generate_questions(doc):
    """Ask the LLM for questions about one document and return its raw reply.

    Args:
        doc: mapping used to fill `prompt_template` (needs at least the
            'question' and 'answer' keys).

    Returns:
        The text content of the first completion choice. It is NOT parsed
        here; callers must handle replies that are not valid JSON.

    Raises:
        The last exception raised by the API call once all retries are used
        up, or immediately for a `requests` HTTPError whose status is not 429.
    """
    prompt = prompt_template.format(**doc)
    retries = 5
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model='llama3-groq-70b-8192-tool-use-preview',
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content
        except Exception as exc:
            # The client reports rate limits as its own RateLimitError rather
            # than requests' HTTPError, so detect 429 by status code instead
            # of by exception class.
            status = _status_code(exc)
            if isinstance(exc, HTTPError) and status != 429:
                raise  # non-rate-limit HTTP errors are not retried
            if attempt == retries - 1:
                raise  # out of retries: surface the error instead of returning None
            backoff = 2 ** attempt  # exponential backoff
            if status == 429:
                time.sleep(_rate_limit_delay(exc, backoff))
            else:
                time.sleep(backoff)
def map_progress(pool, seq, f):
    """Apply `f` to every element of `seq` on `pool`, showing a progress bar.

    All tasks are submitted up front; the bar advances as each one finishes.
    Results are returned in the same order as `seq`. An exception raised by
    `f` propagates from here when that task's result is collected.
    """
    with tqdm(total=len(seq)) as progress:

        def _tick(_finished):
            progress.update()

        pending = []
        for item in seq:
            task = pool.submit(f, item)
            task.add_done_callback(_tick)
            pending.append(task)

        return [task.result() for task in pending]
def process_document(doc):
    """Generate questions for `doc` unless its id is already in `results`.

    Reads the module-level `results` dict. Returns None for an id that is
    already present, otherwise a `(doc_id, questions)` tuple.
    """
    key = doc['id']
    return None if key in results else (key, generate_questions(doc))
# Worker pool shared by the cells below; at most 6 requests run concurrently.
pool = ThreadPoolExecutor(6)

In [48]:
# Generate ground-truth questions chunk by chunk, writing one pickle per chunk
# so a failed run can be resumed by raising `start_chunk`.
chunk_size = 40
start_chunk = 0  # Starting chunk index
# Ceiling division: a trailing partial chunk (len(documents) not a multiple of
# chunk_size) must be processed too instead of being silently dropped.
end_chunk = (len(documents) + chunk_size - 1) // chunk_size  # Ending chunk index (exclusive)
print(end_chunk)
for i in range(start_chunk, end_chunk):
    # `results` is the module-level dict that process_document consults;
    # it is reset for every chunk so each output file holds only that chunk.
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    chunk = documents[chunk_start:chunk_end]  # slicing clamps at the end of the list

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file (creating the output directory if needed)
    file_name = f'../data/vietnamese_rag/ground_truth_data/ground_truth{i + 1}.pkl'
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")

    # Pause 25 seconds between chunks to ease pressure on the API rate limit;
    # there is nothing to wait for after the final chunk.
    if i < end_chunk - 1:
        time.sleep(25)

152


100%|██████████████████████████| 40/40 [00:26<00:00,  1.50it/s]


Chunk 0 processed and saved to ../data/vietnamese_rag/ground_truth_data/ground_truth1.pkl


100%|█████████████████████████| 40/40 [01:07<00:00,  1.69s/it]


Chunk 1 processed and saved to ../data/vietnamese_rag/ground_truth_data/ground_truth2.pkl


 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 30/40 [00:39<00:13,  1.30s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-groq-70b-8192-tool-use-preview` in organization `org_01j07w3a22fmv955d843svxcxt` on requests per minute (RPM): Limit 30, Used 30, Requested 1. Please try again in 1.282s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'requests', 'code': 'rate_limit_exceeded'}}

In [47]:
# Smoke test: run the pipeline on the first two documents only and show the
# raw model replies keyed by document id.
results = {}
processed_results = map_progress(pool, documents[0:2], process_document)

# Keep only documents that were actually processed (None means "skipped").
results.update(entry for entry in processed_results if entry is not None)
print(results)

100%|████████████████████████████| 2/2 [00:00<00:00,  3.83it/s]

{'37b2c2d3': 'Minh Tú đã đạt được thành tích gì trong chương trình Asia’ Next Top Model mùa thứ 5?\nMinh Tú đã vượt qua sự sợ hãi để hoàn thành tốt phần thử thách đi catwalk khi bị treo lơ lửng trên một tòa nhà cao tầng và đạt vị trí thứ 2 trong đêm chung kết của chương trình.', '809411de': '["Tại sao sương mù xuất hiện dày đặc ở TP HCM vào sáng 21/9?", "Mức ô nhiễm của TP HCM có ảnh hưởng gì đến sương mù?", "Sương mù xuất hiện dày đặc ở TP HCM có thể được giảm thiểu bằng cách nào?"]'}



 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 28/40 [00:36<00:27,  2.30s/it]