## Install Packages

In [None]:
# !pip install langchain langchain-community langchain-openai llama-index openai chromadb deepeval tenacity
# !pip install rouge_score
# !pip install nltk bert-score transformers
# !pip install sentence-transformers datasets
# !pip install unidecode
# !pip install contractions
# !pip install langid

In [1]:
# Load the Drive helper and mount
import warnings
warnings.filterwarnings('ignore')
import os
import re
from bs4 import BeautifulSoup
import unidecode
import contractions
import time
import shutil
import uuid
import json
from tqdm.notebook import tqdm
import pandas as pd
from collections import defaultdict
from datasets import Dataset
from rouge_score import rouge_scorer

import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score
from transformers import pipeline
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')

import torch
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModel

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings

import chromadb
from chromadb.config import Settings

# LlamaIndex imports
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.core import Document as LlamaDocument
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode

# Set the environment variable for the Hugging Face cache directory
os.environ['TRANSFORMERS_CACHE'] = 'Model_Download'
os.environ['HF_HOME'] = 'Model_Download'

# Remove previous model save files from the specified location
model_download_path = 'Model_Download'
if os.path.exists(model_download_path):
    shutil.rmtree(model_download_path)

# Create the directory again
os.makedirs(model_download_path, exist_ok=True)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Set API keys
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

In [4]:
# Load TXT files
txt_paths = [
    "scraped_tabs/Capstone Projects.txt",
    "scraped_tabs/Career Outcomes.txt",
    "scraped_tabs/Course Progressions.txt",
    "scraped_tabs/Events & Deadlines.txt",
    "scraped_tabs/FAQs.txt",
    "scraped_tabs/Faculty, Instructors, Staff.txt",
    "scraped_tabs/How to Apply.txt",
    "scraped_tabs/In-Person Program.txt",
    "scraped_tabs/MS_ADS_Campus.txt",
    "scraped_tabs/Online Program.txt",
    "scraped_tabs/Our Students.txt",
    "scraped_tabs/Tuition_Fees_Aid.txt",
    "scraped_tabs/scraped_main_content.txt",
    "scraped_tabs/scraped_valid_links.txt"
]

# Load TXT files
loaders = [TextLoader(path) for path in txt_paths]

documents = []
for loader in loaders:
    documents.extend(loader.load())

[Bge-Large](https://huggingface.co/BAAI/bge-large-en-v1.5)

In [5]:
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Create custom embedding function using BAAI/bge-large-en-v1.5
model_name = "BAAI/bge-large-en-v1.5"
# Initialize tokenizer and model with the new model name
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_download_path)
model = AutoModel.from_pretrained(model_name, cache_dir=model_download_path)

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

### b. Llama RAG

In [6]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Continue with creating LlamaIndex as before
llama_documents = [LlamaDocument.from_langchain_format(doc) for doc in splits]
llama_index = VectorStoreIndex.from_documents(llama_documents, embedding_func=get_embedding)
llama_query_engine = llama_index.as_query_engine()

### c. Query Results

In [7]:
def evaluate_rag_systems(query):
    start_time = time.time()
    llama_result = llama_query_engine.query(query)
    llama_time = time.time() - start_time

    print(f"Query: {query}\n")
    print(f"LlamaIndex Answer: {llama_result}")
    print(f"LlamaIndex Response Time: {llama_time:.2f} seconds\n")

    print("Evaluation:")

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    llama_rouge = scorer.score(query, str(llama_result))
    rouge_score = llama_rouge['rougeL'].fmeasure
    print(f"1. ROUGE-L F1 Score: LlamaIndex ({rouge_score:.4f})")

    reference = [word_tokenize(query.lower())]
    meteor_score_value = meteor_score([reference[0]], word_tokenize(str(llama_result).lower()))
    print(f"2. METEOR Score: LlamaIndex ({meteor_score_value:.4f})")

    _, _, llama_bert_f1 = bert_score([str(llama_result)], [query], lang="en", verbose=False)
    bert_score_value = llama_bert_f1[0].item()
    print(f"3. BERTScore (F1): LlamaIndex ({bert_score_value:.4f})")

    try:
        relevance_model = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
        llama_relevance = relevance_model({"text": query, "text_pair": str(llama_result)})

        relevance_score = llama_relevance[0]['score'] if isinstance(llama_relevance, list) else llama_relevance['score']
        print(f"4. Relevance Score: LlamaIndex ({relevance_score:.4f})")
    except Exception as e:
        print(f"4. Relevance Score: Error calculating relevance score - {str(e)}")
        relevance_score = None

    def hallucination_score(query, answer):
        query_words = set(query.lower().split())
        answer_words = set(answer.lower().split())
        new_words = answer_words - query_words
        return len(new_words) / len(answer_words)

    hallucination_score_value = hallucination_score(query, str(llama_result))
    print(f"5. Hallucination Score: LlamaIndex ({hallucination_score_value:.4f})")

    print("\n---\n")

    return {
        "Query": query,
        "Answer": str(llama_result),
        "Response Time": llama_time,
        "ROUGE-L F1 Score": rouge_score,
        "METEOR Score": meteor_score_value,
        "BERTScore": bert_score_value,
        "Relevance Score": relevance_score,
        "Hallucination Score": hallucination_score_value
    }

In [8]:
queries = [
    "What is tuition cost for the program?",
    "What scholarships are available for the program?",
    "What are the minimum scores for the TOEFL and IELTS English Language Requirement?",
    "Is there an application fee waiver?",
    "What are the deadlines for the in-person program?",
    "How long will it take for me to receive a decision on my application?",
    "Can I set up an advising appointment with the enrollment management team?",
    "Where can I mail my official transcripts?",
    "Does the Master’s in Applied Data Science Online program provide visa sponsorship?",
    "How do I apply to the MBA/MS program?",
    "Is the MS in Applied Data Science program STEM/OPT eligible?", 
    "How many courses must you complete to earn UChicago’s Master’s in Applied Data Science?"
]

In [9]:
# Save the result as TXT file

def save_results(results, file_format):
    file_path = f"Evaluation_Result/RAG Evaluation Result (Before Fine-Tune).{file_format}"

    if file_format == 'txt':
        with open(file_path, 'w') as txtfile:
            for result in results:
                for key, value in result.items():
                    txtfile.write(f"{key}: {value}\n")
                txtfile.write("\n---\n\n")

    print(f"Results saved to {file_path}")

In [10]:
# Evaluate all queries and store results
results = []
for query in queries:
    result = evaluate_rag_systems(query)
    results.append(result)

# Save results as a text file
save_results(results, 'txt')

Query: What is tuition cost for the program?

LlamaIndex Answer: The tuition cost for the program is not explicitly provided in the context information.
LlamaIndex Response Time: 1.36 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.4762)
2. METEOR Score: LlamaIndex (0.5875)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.9047)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (0.9918)
5. Hallucination Score: LlamaIndex (0.5833)

---

Query: What scholarships are available for the program?

LlamaIndex Answer: Merit scholarships are available for eligible applicants. Once you apply to the program, you will be automatically considered for a scholarship. Early applications are highly encouraged.
LlamaIndex Response Time: 1.05 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.3636)
2. METEOR Score: LlamaIndex (0.4180)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.8918)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (0.9997)
5. Hallucination Score: LlamaIndex (0.7826)

---

Query: What are the minimum scores for the TOEFL and IELTS English Language Requirement?

LlamaIndex Answer: The minimum TOEFL score required is 102 total, and the minimum IELTS score required is 7.0 overall.
LlamaIndex Response Time: 1.91 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.3871)
2. METEOR Score: LlamaIndex (0.2759)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.8991)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (0.9999)
5. Hallucination Score: LlamaIndex (0.5833)

---

Query: Is there an application fee waiver?

LlamaIndex Answer: Yes, there is an application fee waiver available for certain categories of applicants as outlined in the provided context information.
LlamaIndex Response Time: 1.14 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.3846)
2. METEOR Score: LlamaIndex (0.6618)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.8934)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (1.0000)
5. Hallucination Score: LlamaIndex (0.7500)

---

Query: What are the deadlines for the in-person program?

LlamaIndex Answer: The deadlines for the in-person program are March 4, 2025 for the Second Priority Application Deadline, May 6, 2025 for the Third Priority Application Deadline, and June 23, 2025 for the Final Application Deadline.
LlamaIndex Response Time: 1.36 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.3182)
2. METEOR Score: LlamaIndex (0.4731)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.8807)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (1.0000)
5. Hallucination Score: LlamaIndex (0.7619)

---

Query: How long will it take for me to receive a decision on my application?

LlamaIndex Answer: Typically, admissions decisions are released 1-2 months after each application deadline.
LlamaIndex Response Time: 0.69 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.1538)
2. METEOR Score: LlamaIndex (0.0676)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.8681)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (0.9970)
5. Hallucination Score: LlamaIndex (1.0000)

---

Query: Can I set up an advising appointment with the enrollment management team?

LlamaIndex Answer: The enrollment management team does not handle advising appointments.
LlamaIndex Response Time: 0.73 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.3810)
2. METEOR Score: LlamaIndex (0.4637)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.8976)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (0.9995)
5. Hallucination Score: LlamaIndex (0.5556)

---

Query: Where can I mail my official transcripts?

LlamaIndex Answer: Official transcripts should not be mailed as part of the admission application process. If offered admission, one official transcript for each university attended will be required at least one month prior to matriculation.
LlamaIndex Response Time: 1.02 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.1500)
2. METEOR Score: LlamaIndex (0.1852)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.8583)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (0.9953)
5. Hallucination Score: LlamaIndex (0.9667)

---

Query: Does the Master’s in Applied Data Science Online program provide visa sponsorship?

LlamaIndex Answer: The Master’s in Applied Data Science Online Program does not provide visa sponsorship.
LlamaIndex Response Time: 0.85 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.8889)
2. METEOR Score: LlamaIndex (0.9226)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.9614)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (1.0000)
5. Hallucination Score: LlamaIndex (0.1538)

---

Query: How do I apply to the MBA/MS program?

LlamaIndex Answer: To apply to the MBA/MS program, applicants should complete the Chicago Booth Full-Time MBA application and select the MBA/MS in Applied Data Science as their program of interest. Additionally, an MBA/MS program supplement will be available for completion within the Booth application, which contains Applied Data Science specific questions. It is important to complete both the MBA application and the joint degree program supplement in the same application round for full consideration.
LlamaIndex Response Time: 1.59 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.1412)
2. METEOR Score: LlamaIndex (0.1572)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.8728)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (1.0000)
5. Hallucination Score: LlamaIndex (0.9167)

---

Query: Is the MS in Applied Data Science program STEM/OPT eligible?

LlamaIndex Answer: Yes, the MS in Applied Data Science program is STEM/OPT eligible.
LlamaIndex Response Time: 0.80 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.8696)
2. METEOR Score: LlamaIndex (0.8808)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.9603)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (1.0000)
5. Hallucination Score: LlamaIndex (0.1818)

---

Query: How many courses must you complete to earn UChicago’s Master’s in Applied Data Science?

LlamaIndex Answer: You must complete 12 courses to earn UChicago’s Master’s in Applied Data Science.
LlamaIndex Response Time: 0.76 seconds

Evaluation:
1. ROUGE-L F1 Score: LlamaIndex (0.7742)
2. METEOR Score: LlamaIndex (0.8336)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. BERTScore (F1): LlamaIndex (0.9618)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


4. Relevance Score: LlamaIndex (1.0000)
5. Hallucination Score: LlamaIndex (0.1538)

---

Results saved to Evaluation_Result/RAG Evaluation Result (Before Fine-Tune).txt


### c. Generate Dataset

In [12]:
def clean_text(text):

    # Remove extra whitespaces
    text = ' '.join(text.split())

    # Convert accented characters to ASCII characters
    text = unidecode.unidecode(text)

    # Expand contractions
    text = contractions.fix(text)

    # Remove newline characters
    text = re.sub(r'(?:\n)', ' ', text)

    # Strip leading and trailing spaces
    text = text.strip()

    return text

In [13]:
# Define file paths
TRAIN_FILES = [
    "scraped_tabs/Capstone Projects.txt",
    "scraped_tabs/Career Outcomes.txt",
    "scraped_tabs/Course Progressions.txt",
    "scraped_tabs/Events & Deadlines.txt",
    "scraped_tabs/FAQs.txt",
    "scraped_tabs/Faculty, Instructors, Staff.txt",
    "scraped_tabs/How to Apply.txt"
]

VAL_FILES = [
    "scraped_tabs/In-Person Program.txt",
    "scraped_tabs/MS_ADS_Campus.txt",
    "scraped_tabs/Online Program.txt",
    "scraped_tabs/Our Students.txt",
    "scraped_tabs/Tuition_Fees_Aid.txt",
    "scraped_tabs/scraped_main_content.txt",
    "scraped_tabs/scraped_valid_links.txt"
]

# Define save directory
SAVE_DIR = 'train_test_df'


In [26]:
# Define file paths for saving datasets
TRAIN_CORPUS_FPATH = os.path.join(SAVE_DIR, 'train_corpus.json')
VAL_CORPUS_FPATH = os.path.join(SAVE_DIR, 'val_corpus.json')
TRAIN_QUERIES_FPATH = os.path.join(SAVE_DIR, 'train_queries.json')
TRAIN_RELEVANT_DOCS_FPATH = os.path.join(SAVE_DIR, 'train_relevant_docs.json')
VAL_QUERIES_FPATH = os.path.join(SAVE_DIR, 'val_queries.json')
VAL_RELEVANT_DOCS_FPATH = os.path.join(SAVE_DIR, 'val_relevant_docs.json')
TRAIN_DATASET_FPATH = os.path.join(SAVE_DIR, 'train_dataset.json')
VAL_DATASET_FPATH = os.path.join(SAVE_DIR, 'val_dataset.json')

In [27]:
def load_corpus(files, verbose=False):
    if verbose:
        print(f"Loading files {files}")

    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if verbose:
        print(f'Loaded {len(docs)} docs')

    parser = SimpleNodeParser.from_defaults()
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f'Parsed {len(nodes)} nodes')

    corpus = {node.node_id: clean_text(node.get_content(metadata_mode=MetadataMode.NONE)) for node in nodes}
    return corpus

In [28]:
def generate_queries(
    corpus,
    num_questions_per_chunk=2,
    prompt_template=None,
    verbose=False,
):
    llm = LlamaOpenAI(model='gpt-3.5-turbo')

    prompt_template = prompt_template or """\
    Context information is below.

    ---------------------
    {context_str}
    ---------------------

    Given the context information and not prior knowledge.
    generate only questions based on the below query.

    You are a Teacher/ Professor. Your task is to setup \
    {num_questions_per_chunk} questions for an upcoming \
    quiz/examination. The questions should be diverse in nature \
    across the document. Restrict the questions to the \
    context information provided."
    """

    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(corpus.items()):
        query = prompt_template.format(context_str=text, num_questions_per_chunk=num_questions_per_chunk)
        response = llm.complete(query)

        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0]

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = clean_text(question)
            relevant_docs[question_id] = [node_id]
    return queries, relevant_docs

In [29]:
# Load and save corpus
train_corpus = load_corpus(TRAIN_FILES, verbose=True)
val_corpus = load_corpus(VAL_FILES, verbose=True)

with open(TRAIN_CORPUS_FPATH, 'w+') as f:
    json.dump(train_corpus, f)

with open(VAL_CORPUS_FPATH, 'w+') as f:
    json.dump(val_corpus, f)

Loading files ['scraped_tabs/Capstone Projects.txt', 'scraped_tabs/Career Outcomes.txt', 'scraped_tabs/Course Progressions.txt', 'scraped_tabs/Events & Deadlines.txt', 'scraped_tabs/FAQs.txt', 'scraped_tabs/Faculty, Instructors, Staff.txt', 'scraped_tabs/How to Apply.txt']
Loaded 7 docs


Parsing nodes:   0%|          | 0/7 [00:00<?, ?it/s]

Parsed 25 nodes
Loading files ['scraped_tabs/In-Person Program.txt', 'scraped_tabs/MS_ADS_Campus.txt', 'scraped_tabs/Online Program.txt', 'scraped_tabs/Our Students.txt', 'scraped_tabs/Tuition_Fees_Aid.txt', 'scraped_tabs/scraped_main_content.txt', 'scraped_tabs/scraped_valid_links.txt']
Loaded 7 docs


Parsing nodes:   0%|          | 0/7 [00:00<?, ?it/s]

Parsed 315 nodes


In [30]:
# Generate and save queries
train_queries, train_relevant_docs = generate_queries(train_corpus)
val_queries, val_relevant_docs = generate_queries(val_corpus)

with open(TRAIN_QUERIES_FPATH, 'w+') as f:
    json.dump(train_queries, f)

with open(TRAIN_RELEVANT_DOCS_FPATH, 'w+') as f:
    json.dump(train_relevant_docs, f)

with open(VAL_QUERIES_FPATH, 'w+') as f:
    json.dump(val_queries, f)

with open(VAL_RELEVANT_DOCS_FPATH, 'w+') as f:
    json.dump(val_relevant_docs, f)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/315 [00:00<?, ?it/s]

In [31]:
# Create and save complete datasets
train_dataset = {
    'queries': train_queries,
    'corpus': train_corpus,
    'relevant_docs': train_relevant_docs,
}

val_dataset = {
    'queries': val_queries,
    'corpus': val_corpus,
    'relevant_docs': val_relevant_docs,
}

with open(TRAIN_DATASET_FPATH, 'w+') as f:
    json.dump(train_dataset, f)

with open(VAL_DATASET_FPATH, 'w+') as f:
    json.dump(val_dataset, f)

print("Dataset generation complete. Files saved in:", SAVE_DIR)

Dataset generation complete. Files saved in: train_test_df
