In [21]:
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
import json
import os
import re

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the Groq API key used by ChatGroq comes from — confirm.
load_dotenv()

True

In [17]:
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq

# Prompt used to synthesise labelled intent-classification records for one subject.
# Literal braces are doubled ({{ }}) so that {subject} is the only placeholder.
# The JSON samples inside the prompt contain no trailing commas: strict JSON forbids
# them, and a model that imitates them emits output that json.loads() rejects.
template = """
You are generating records for a given Subject to fine-tune an intent classification model. Each record should include a random context related to the ONLY input variable "Subject", user input, and the classified intent according to the context and user input based on the categories below:

- Valid answer: The user has provided a correct or appropriate response to a question or topic.
- Valid question: The user has asked a clear and relevant question related to the context or discussion.
- Irrelevant input: The user is providing an irrelevant answer or asking an out-of-context question.
- Boredom: The user is expressing a lack of interest, excitement, or engagement with the current topic or situation.
- Insult: The user is using offensive or derogatory language directed at someone or something, showing disrespect.
- Greeting: The user is initiating a conversation or interaction with a friendly or polite salutation, such as "Hello" or "Good morning."
- Clarification request: The user is asking for more information or a clearer explanation to better understand something they are confused about.

Some Instructions regarding Greeting: If a valid question,Valid answer, Irrelevant Input or a Clarification Request is followed by a Greeting, It belongs to the category which is followed by the greeting, Not for the "Greeting" category
---------------------------------------------------------------------------------------
Ex: 
{{
    "context": "Fossils are the preserved remains or imprints of ancient plants and animals. They provide important evidence for the history of life on Earth and are used to study the process of evolution. Fossils can be found in sedimentary rocks and are often used to date rocks and reconstruct ancient environments.",
    "user_input": "Hi, can you explain fossils to me?"
}}
The intent for above is a "Clarification request", Not a Greeting
---------------------------------------------------------------------------------------



Input Subject = {subject}

Generated Context should be a Description of not less than 50 words, that might be included in a textbook or a question paper.

Final Output in JSON format:

[
    {{
        "context": "",
        "user_input": "",
        "intent": ""
    }},
    {{
        "context": "",
        "user_input": "",
        "intent": ""
    }}
]

IMPORTANT: Final output should ONLY contain the JSON file, No other words 
"""

# Wrap the raw template text; "subject" is its single fill-in variable.
prompt = PromptTemplate(template=template, input_variables=["subject"])


In [18]:
# Chat model shared by every generation cell below.
groq_model = ChatGroq(model="llama3-70b-8192", temperature=0.9)

In [19]:
# Smoke test: render the prompt for one subject and print the raw model reply.
final_prompt = prompt.format(subject="Grade 12 Mathematics")
response = groq_model.invoke(final_prompt)
print(response.content)

[
    {
        "context": "In Grade 12 Mathematics, the concept of derivatives is introduced to measure the rate of change of a function. The derivative of a function f at a point x is denoted as f'(x) and represents the rate of change of the function with respect to x. Derivatives have numerous applications in physics, engineering, and economics, including optimization problems, motion along a line, and related rates.",
        "user_input": "What is the formula to find the derivative of a function?",
        "intent": "Valid question"
    },
    {
        "context": "The mathematical concept of trigonometry is essential in Grade 12 Mathematics, dealing with the relationships between the sides and angles of triangles. The unit circle is used to define the trigonometric functions sine, cosine, and tangent, which are crucial in solving triangular problems.",
        "user_input": "I don't understand why we need to learn trigonometry.",
        "intent": "Boredom"
    },
    {
        "

In [20]:
import json
import os
import re

def extract_json_blocks(text):
    """Return every well-formed JSON array or object embedded in ``text``.

    The text is scanned left to right. At each ``[`` or ``{`` a real JSON
    decoder tries to parse a value starting there, so nested structures and
    brackets inside string values are handled correctly (a non-greedy regex
    would stop at the first closing bracket). When parsing fails at an
    opener -- e.g. an array spoiled by a trailing comma or a truncated
    reply -- the scan resumes one character later, which still recovers any
    complete objects nested inside the broken value.

    Parameters:
    text (str): Arbitrary text, typically a raw model reply.

    Returns:
    list[str]: The matched substrings, in order of appearance. Each one is
    valid JSON, so callers can pass it straight to ``json.loads``. Empty
    when ``text`` is empty or contains no parsable block.
    """
    if not text:
        return []

    decoder = json.JSONDecoder()
    opener = re.compile(r'[\[{]')
    blocks = []
    cursor = 0
    while True:
        match = opener.search(text, cursor)
        if match is None:
            break
        start = match.start()
        try:
            # raw_decode parses one JSON value at `start` and reports where it ended.
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a complete value here; look for the next opener inside it.
            cursor = start + 1
            continue
        blocks.append(text[start:end])
        cursor = end
    return blocks

def update_json_file(file_name: str, subject: str, iterations: int = 20):
    """Generate records with ``groq_model`` and append them to a JSON file.

    Existing content of ``file_name`` is loaded first (it must be a JSON
    list; anything else is reported and replaced by an empty list). The
    module-level ``prompt`` is then rendered for ``subject`` and sent to
    ``groq_model`` ``iterations`` times. Every JSON block found in a reply
    is parsed and its dict records are appended; non-dict values (stray
    numbers, strings, ...) are dropped because they are not records.

    The accumulated list is written back even if the loop is interrupted
    (e.g. KeyboardInterrupt or an API error), so finished iterations are
    not lost; the exception still propagates afterwards.

    Parameters:
    file_name (str): Path of the JSON file to create or extend.
    subject (str): Value substituted for ``{subject}`` in the prompt.
    iterations (int): Number of model calls to make (default 20).
    """
    data = []
    if os.path.exists(file_name) and os.path.getsize(file_name) > 0:
        with open(file_name, 'r', encoding='utf-8') as file:
            try:
                loaded = json.load(file)
            except json.JSONDecodeError:
                print(f"Error: Invalid JSON content in {file_name}. Initializing empty list.")
            else:
                if isinstance(loaded, list):
                    data = loaded
                else:
                    print(f"Error: Expected a JSON list in {file_name}. Initializing empty list.")

    added = 0
    try:
        for _ in range(iterations):
            final_prompt = prompt.format(subject=subject)
            response = groq_model.invoke(final_prompt)
            print(response.content)  # Output the content for debugging

            for json_block in extract_json_blocks(response.content):
                try:
                    parsed = json.loads(json_block)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
                    continue
                # A block is either a list of records or one bare record.
                candidates = parsed if isinstance(parsed, list) else [parsed]
                records = [item for item in candidates if isinstance(item, dict)]
                data.extend(records)
                added += len(records)
    finally:
        # Persist whatever was collected, including after an interruption.
        with open(file_name, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=4)

    print(f"Updated JSON file: {file_name} with {added} new records.")

# Demo run: five generation rounds for one subject, accumulated in data5.json.
update_json_file(file_name='data5.json', subject="University Level Acoustics physics", iterations=5)


[
    {
        "context": "In university-level acoustics physics, the study of sound waves and their properties is crucial for understanding various phenomena. Sound waves are mechanical waves that propagate through a medium, such as air, water, or solids, and can be described by their frequency, wavelength, and speed. The speed of sound is an important concept in acoustics, as it depends on the properties of the medium and can be affected by factors such as temperature, humidity, and air pressure.",
        "user_input": "What are the factors that affect the speed of sound?",
        "intent": "Valid question"
    },
    {
        "context": "Acoustic impedance is a fundamental concept in acoustics physics, referring to the opposition to the flow of sound energy through a medium. It is a complex quantity that depends on the properties of the medium, such as its density and elasticity. Understanding acoustic impedance is essential for analyzing and predicting the behavior of sound wav

KeyboardInterrupt: 

# RAG

In [23]:
# RAG
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import PyPDFDirectoryLoader



# Prompt for the retrieval-augmented variant of the record generator.
# Literal braces are doubled ({{ }}) so that {subject} is the only placeholder.
# The JSON samples inside the prompt contain no trailing commas: strict JSON forbids
# them, and a model that imitates them emits output that json.loads() rejects.
after_rag_template = """
You are generating records for a given Subject to fine-tune an intent classification model. Each record should include a random context related to the ONLY input variable "Subject", user input, and the classified intent according to the context and user input based on the categories below:

- Valid answer: The user has provided a correct or appropriate response to a question or topic.
- Valid question: The user has asked a clear and relevant question related to the context or discussion.
- Irrelevant input: The user is providing an irrelevant answer or asking an out-of-context question.
- Boredom: The user is expressing a lack of interest, excitement, or engagement with the current topic or situation.
- Insult: The user is using offensive or derogatory language directed at someone or something, showing disrespect.
- Greeting: The user is initiating a conversation or interaction with a friendly or polite salutation, such as "Hello" or "Good morning."
- Clarification request: The user is asking for more information or a clearer explanation to better understand something they are confused about.

Some Instructions regarding Greeting: If a valid question,Valid answer, Irrelevant Input or a Clarification Request is followed by a Greeting, It belongs to the category which is followed by the greeting, Not for the "Greeting" category
---------------------------------------------------------------------------------------
Ex: 
{{
    "context": "Fossils are the preserved remains or imprints of ancient plants and animals. They provide important evidence for the history of life on Earth and are used to study the process of evolution. Fossils can be found in sedimentary rocks and are often used to date rocks and reconstruct ancient environments.",
    "user_input": "Hi, can you explain fossils to me?"
}}
The intent for above is a "Clarification request", Not a Greeting.
---------------------------------------------------------------------------------------



Input Subject = {subject}

Generated Context should be a Description of not less than 50 words, A good context provides clear, concise background information to set the stage for understanding a concept or scenario, typically found in textbooks or exam questions

Final Output in JSON format:

[
    {{
        "context": "",
        "user_input": "",
        "intent": ""
    }},
    {{
        "context": "",
        "user_input": "",
        "intent": ""
    }}
]

IMPORTANT: Final output should ONLY contain the JSON file, No other words 
"""










In [31]:



# Directory holding the persisted FAISS index; it is created on the first run.
faiss_index_path = 'eval'

# dir_loader = PyPDFDirectoryLoader('resources')

pdf_loader = PyMuPDFLoader('resources/grade-11-information-and-communication-technology-ict-text-book-6200f55f7b294.pdf')
documents = pdf_loader.load()
# NOTE(review): all-MiniLM-L6-v2 is not a BGE model, yet it is loaded through the
# BGE wrapper class — confirm this pairing is intended. Changing it would require
# rebuilding any index already saved under faiss_index_path.
embedding_huggingface = HuggingFaceBgeEmbeddings(model_name="all-MiniLM-L6-v2")

# Reuse the index from disk when it exists; otherwise build and save it.
if os.path.exists(faiss_index_path):
    print("Loading existing vector store from disk...")
    # allow_dangerous_deserialization=True lets load_local unpickle the stored data.
    # Unpickling can execute arbitrary code, so only load index folders created locally.
    vectorstore = FAISS.load_local(faiss_index_path, embedding_huggingface, allow_dangerous_deserialization=True)
else:
    if not documents:
        # Fail here with a clear message instead of a NameError (or a stale
        # `vectorstore` left in the kernel) on the as_retriever() line below.
        raise ValueError(
            f"No documents were loaded from the PDF and no FAISS index exists at "
            f"'{faiss_index_path}'; cannot build the vector store."
        )

    # NOTE(review): 7500-token chunks are very large for a small sentence-embedding
    # model; verify the model's input limit does not truncate most of each chunk.
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=200)
    split_documents = text_splitter.split_documents(documents)

    vectorstore = FAISS.from_documents(
        documents=split_documents,
        embedding=embedding_huggingface,
    )

    vectorstore.save_local(faiss_index_path)

retriever = vectorstore.as_retriever()





In [32]:

after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)


def _join_page_contents(docs):
    """Concatenate the text of the retrieved documents into one prompt-ready string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The chain input string is used twice: as the retrieval query and as "question".
# The retriever returns Document objects, so they are reduced to their text before
# filling {subject}; otherwise the prompt would contain the repr of the Document
# list (metadata included) instead of readable passages.
# NOTE(review): after_rag_template has no {question} placeholder, so the
# "question" value never reaches the model — confirm whether that is intended.
after_rag_chain = (
    {"subject": retriever | _join_page_contents, "question": RunnablePassthrough()}
    | after_rag_prompt
    | groq_model
    | StrOutputParser()
)

In [33]:
import json
import os
import re

def extract_json_blocks(text):
    """Return every well-formed JSON array or object embedded in ``text``.

    The text is scanned left to right. At each ``[`` or ``{`` a real JSON
    decoder tries to parse a value starting there, so nested structures and
    brackets inside string values are handled correctly (a non-greedy regex
    would stop at the first closing bracket). When parsing fails at an
    opener -- e.g. an array spoiled by a trailing comma or a truncated
    reply -- the scan resumes one character later, which still recovers any
    complete objects nested inside the broken value.

    Parameters:
    text (str): Arbitrary text, typically a raw model reply.

    Returns:
    list[str]: The matched substrings, in order of appearance. Each one is
    valid JSON, so callers can pass it straight to ``json.loads``. Empty
    when ``text`` is empty or contains no parsable block.
    """
    if not text:
        return []

    decoder = json.JSONDecoder()
    opener = re.compile(r'[\[{]')
    blocks = []
    cursor = 0
    while True:
        match = opener.search(text, cursor)
        if match is None:
            break
        start = match.start()
        try:
            # raw_decode parses one JSON value at `start` and reports where it ended.
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a complete value here; look for the next opener inside it.
            cursor = start + 1
            continue
        blocks.append(text[start:end])
        cursor = end
    return blocks

def update_json_file_from_response(file_name: str, iterations:int):
    """
    Updates the JSON file with new records generated by the RAG chain.

    Existing content of the file is loaded first (it must be a JSON list;
    anything else is reported and replaced by an empty list). The
    module-level ``after_rag_chain`` is then invoked ``iterations`` times
    with a fixed instruction string. Every JSON block found in a reply is
    parsed and its dict records are appended; non-dict values are dropped
    because they are not records. The file is rewritten after every
    iteration, so an interrupted run keeps the finished iterations.

    Parameters:
    file_name (str): The name of the JSON file to update.
    iterations (int): How many times to invoke the chain.
    """
    data = []
    if os.path.exists(file_name) and os.path.getsize(file_name) > 0:
        with open(file_name, 'r', encoding='utf-8') as file:
            try:
                loaded = json.load(file)
            except json.JSONDecodeError:
                print(f"Error: Invalid JSON content in {file_name}. Initializing empty list.")
            else:
                if isinstance(loaded, list):
                    data = loaded
                else:
                    print(f"Error: Expected a JSON list in {file_name}. Initializing empty list.")

    # The same instruction is sent on every round, so build it once.
    question = "Provide the output IN JSON FORMAT."

    for _ in range(iterations):
        response = after_rag_chain.invoke(question)
        print(response)

        for json_block in extract_json_blocks(response):
            try:
                parsed = json.loads(json_block)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue
            # A block is either a list of records or one bare record.
            candidates = parsed if isinstance(parsed, list) else [parsed]
            data.extend(item for item in candidates if isinstance(item, dict))

        # Write the updated data back to the file after each round.
        with open(file_name, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=4)

        print(f"Updated JSON file: {file_name} with new records.")

# Demo run: 30 retrieval-augmented generation rounds, accumulated in eval.json.
update_json_file_from_response(file_name='eval.json', iterations=30)

[
    {
        "context": "When designing a web page, it is essential to consider the organization and structure of the content. A site map can be used to visualize the relationships between different pages and help in planning the navigation. The home page should provide a summary of the entire content of the website and present information in brief, using numbered lists, bulleted lists, and indentation.",
        "user_input": "How can I make the home page more engaging?",
        "intent": "Valid question"
    },
    {
        "context": "HTML tags are used to define the structure and content of a web page. There are different types of tags, including headings, paragraphs, lists, and links. The <dl> tag is used to define a description list, which consists of terms and their descriptions.",
        "user_input": "What is the purpose of the <dl> tag?",
        "intent": "Valid question"
    },
    {
        "context": "A description list is a type of list that provides a description 

In [26]:
import os
import json
import pandas as pd

# Directory where the JSON files are stored
directory = "data"

# One flat row per valid record, tagged with the file it came from
all_data = []

# os.listdir() returns names in arbitrary order; sorting keeps the row order reproducible.
for file_name in sorted(os.listdir(directory)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(directory, file_name)
    with open(file_path, 'r', encoding='utf-8') as json_file:
        try:
            data = json.load(json_file)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from file: {file_name}")
            continue

    # Each file is expected to hold a list of records; anything else cannot be iterated safely.
    if not isinstance(data, list):
        print(f"Skipping {file_name}: top-level JSON value is not a list")
        continue

    for entry in data:
        # Only dict entries have the keys read below.
        if isinstance(entry, dict):
            all_data.append({
                "json_file_name": file_name,
                "context": entry.get("context", ""),
                "user_input": entry.get("user_input", ""),
                "intent": entry.get("intent", "")
            })
        else:
            print(f"Skipping invalid entry in {file_name}: {entry}")

# Naming the columns keeps the frame's schema stable even when no records were found.
df = pd.DataFrame(all_data, columns=["json_file_name", "context", "user_input", "intent"])
# Display the DataFrame to the user
df


Skipping invalid entry in data1.json: 0
Skipping invalid entry in data1.json: 1


Unnamed: 0,json_file_name,context,user_input,intent
0,data1.json,Solve for x in the equation 2x + 5 = 11. Show ...,x = 3,Valid answer
1,data1.json,"In a right-angled triangle, the length of the ...",Can I use the Pythagorean theorem?,Valid question
2,data1.json,The graph of a quadratic function opens upward...,I don't care about math,Boredom
3,data1.json,"In the diagram below, ABC is a triangle with a...",MN is parallel to AC because it is the midpoin...,Valid answer
4,data1.json,The sum of the interior angles of a polygon is...,What is a polygon again?,Clarification request
...,...,...,...,...
1240,data5.json,Sound waves can be classified into two main ca...,"That's so confusing, can you explain it again?",Clarification request
1241,data5.json,"The Doppler effect, a fundamental concept in a...","This is stupid, I'll never use this in real life.",Irrelevant input
1242,data5.json,"Acoustic resonance, a phenomenon in which a sy...",You're an idiot if you think this is important.,Insult
1243,data5.json,"The decibel, a unit of measurement for sound i...","Hi, I have a question about sound intensity.",Clarification request


In [30]:
import os
import json

# Directory where JSON files are stored
input_directory = 'data'
output_file = 'combined_.json'

# All records from every input file, in file-name order
combined_data = []

# os.listdir() returns names in arbitrary order; sorting keeps the merged file reproducible.
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(input_directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            data = json.load(file)
        except json.JSONDecodeError:
            # One malformed file should not abort the whole merge.
            print(f"Error decoding JSON from file: {filename}")
            continue

    if not isinstance(data, list):
        print(f"Skipping {filename}: top-level JSON value is not a list")
        continue

    # Keep only dict records so stray scalars in an input file do not end up in the merged dataset.
    combined_data.extend(entry for entry in data if isinstance(entry, dict))

# Write combined data to output file
with open(output_file, 'w', encoding='utf-8') as output:
    json.dump(combined_data, output, indent=4)

print(f"Combined JSON data saved to {output_file}")


Combined JSON data saved to combined_.json
