In [12]:
import pdfplumber
import re
import nltk
from nltk.corpus import stopwords

import csv

from sentence_transformers import SentenceTransformer

import faiss
import numpy as np

In [2]:
# Download NLTK data (you only need to run this once)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vyshn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vyshn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Define a function to clean and preprocess text
def clean_text(text):
    """
    Cleans and preprocesses the text for vectorization.
    - Removes headers, footers, special characters, and unnecessary whitespace.
    - Tokenizes text into sentences.
    - Removes stop words.
    """
    # Remove headers and footers (assumption: they repeat every page)
    text = re.sub(r'CareerSeva.Com\'s Destiny Designers: A Comprehensive Guide to Career Counseling Entrepreneurship in India - 2023*|CareerSeva.Com -A2Z Career Guidance & Planning    © Sfurti Media Production, Pune.2023*', '', text)

    # Remove special characters and excessive whitespace
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    
    # Tokenize text into sentences
    sentences = nltk.sent_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    processed_sentences = []
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        filtered_words = [word for word in words if word.lower() not in stop_words]
        processed_sentences.append(' '.join(filtered_words))
    
    # Return cleaned sentences as a single string
    return ' '.join(processed_sentences)

In [4]:
# Define a function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file while handling images, tables, bullet points, headers, and footers.
    """
    extracted_text = ""
    
    # Open the PDF file
    with pdfplumber.open(pdf_path) as pdf:
        # Iterate through each page
        for page in pdf.pages:
            # Extract text from the page
            page_text = page.extract_text()
            
            # Skip pages with no text
            if not page_text:
                continue
            
            # Clean the extracted text
            cleaned_text = clean_text(page_text)
            
            # Append cleaned text
            extracted_text += cleaned_text + " "
    
    return extracted_text.strip()

In [5]:
def remove_footer(text):
    pattern = r"CareerSevaCom A2Z Career Guidance Planning Sfurti Media Production Pune2023 Page \d+"
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

In [6]:
def remove_numeric_tokens(tokens_string):
    tokens = tokens_string.split()
    filtered_tokens = [token for token in tokens if not token.isdigit()]
    result_string = ' '.join(filtered_tokens)
    return result_string

In [7]:
def remove_tokens_between(tokens_string):
    """
    Remove page containing table of contents
    """
    # Split the space-separated string into a list of tokens
    tokens = tokens_string.split()
    
    # Initialize flags for tracking token range
    within_range = False
    filtered_tokens = []
    count = 0
    
    for token in tokens:
        if token == "Table":
            within_range = True
            count += 1
        
        if not within_range:
            filtered_tokens.append(token)
        
        if token == "153":
            within_range = False
            count -= 1
    
    # Join the filtered tokens back into a space-separated string
    cleaned_tokens_string = ' '.join(filtered_tokens)
    
    #print(count)
    return cleaned_tokens_string

In [8]:
# Path to your PDF file
pdf_path = 'Dataset.pdf'

# Extract and preprocess text from the PDF
processed_text = extract_text_from_pdf(pdf_path)

# Remove Footer
processed_text = remove_footer(processed_text)

# Remove Table of Contents page
processed_text = remove_tokens_between(processed_text)

# Remove number tokens
processed_text = remove_numeric_tokens(processed_text)

# Print a preview of the processed text
print(processed_text[:500])  # Display the first 500 characters to verify

CareerSevaComs EBook Edition Destiny Designers Comprehensive Guide Career Counseling Entrepreneurship India Teacher Working SelfEmployed Professional Digital Content Creator Housewife Simply Graduate Want Make Rewarding Career Career Counseling Begin Journey Unlocking Secrets Successful Career Counseling Future Educators Coaches Author Editor CareerSevacoms Expert Career Counselors Content Creation Team EBook MRP INR Published Sfurti Media Production Pune Preface fluid tapestry life dreams inter


In [9]:
def save_tokens_to_csv(tokens_string, filename):
    tokens = tokens_string.split()
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(tokens)

filename = "tokens.csv"
save_tokens_to_csv(processed_text, filename)

In [10]:
# Load the model
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1') # Chosen since it is recommended for QnA systems and trained on semantic results

sentences = nltk.sent_tokenize(processed_text)  # Tokenize the text into sentences

# Vectorize the sentences using the selected model
sentence_embeddings = model.encode(sentences, show_progress_bar=True)

Batches: 100%|██████████| 1/1 [00:03<00:00,  3.53s/it]


In [11]:
print("Generated Embeddings for QA Task:")
print(sentence_embeddings[:5])  # Print the first 5 embeddings to verify

Generated Embeddings for QA Task:
[[ 1.17667601e-01  2.72827625e-01 -3.90777200e-01 -2.75067508e-01
   2.74234056e-01 -7.43905678e-02  3.50089252e-01  2.12362513e-01
   4.69452925e-02  6.91179410e-02  1.38827920e-01  3.93895924e-01
   1.55625353e-02  2.12807357e-01  8.15760251e-03  4.95283306e-03
   5.16690165e-02  1.80346072e-01  5.47197461e-03 -4.97416928e-02
  -2.28070356e-02  7.05097988e-02  1.78358331e-02  3.27904463e-01
  -5.88320792e-02 -2.87404656e-01  2.69400895e-01  2.50371575e-01
  -4.97082844e-02 -1.37031823e-01 -1.36470608e-02  5.36916144e-02
  -1.90986499e-01 -5.61979413e-03 -9.72293274e-05 -6.03161156e-02
   5.71936928e-02 -1.60974592e-01 -3.33396614e-01 -5.22207618e-02
  -6.39481992e-02  1.17418274e-01 -2.85843730e-01 -1.46881565e-01
  -3.08612734e-02  3.47103477e-02  7.96274096e-03  1.37949735e-02
   2.04369992e-01  4.76509742e-02  3.84025306e-01 -2.79168993e-01
   1.70563802e-01 -1.59542546e-01 -1.09696470e-01  2.17482358e-01
  -1.36325270e-01 -3.32927108e-02  5.45635

In [15]:
# Convert the embeddings to a numpy array (required by FAISS)
embeddings_array = np.array(sentence_embeddings).astype('float32')

# Initialize the FAISS index
embedding_dimension = embeddings_array.shape[1]  # Dimensions of the embeddings (should be 768 for mpnet-base)
index = faiss.IndexFlatL2(embedding_dimension)  # L2 distance is used for similarity search

# Add embeddings to the FAISS index
index.add(embeddings_array)

# Save the FAISS index for future use
faiss.write_index(index, "faiss_index.bin")

# Also, save sentences for retrieving the context later
import pickle

with open('sentences.pkl', 'wb') as f:
    pickle.dump(sentences, f)

print("Embeddings saved in FAISS index.")

Embeddings saved in FAISS index.


In [28]:
def chunk_text(text, max_length=400):
    """
    Chunks the text into smaller parts based on a maximum length.
    """
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)
    
    chunks = []
    current_chunk = []
    current_length = 0  # Reset current length for every new chunk

    # Create chunks by grouping sentences
    for sentence in sentences:
        # Clean up the sentence to avoid special character issues
        sentence = sentence.strip()

        # Split sentence into words
        sentence_length = len(sentence.split())

        if current_length + sentence_length > max_length:
            # If adding the current sentence exceeds max_length, finalize the current chunk
            if current_chunk:  # Ensure not adding empty chunks
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]  # Start a new chunk with the current sentence
            current_length = sentence_length  # Reset current length
        else:
            # Add sentence to the current chunk
            current_chunk.append(sentence)
            current_length += sentence_length  # Update the current length
    
    # Add the last chunk if it has any sentences
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    # Debugging: Print out the chunks to verify
    print(f"Generated {len(chunks)} chunks:")
    for i, chunk in enumerate(chunks[:5]):  # Show only the first 5 chunks for brevity
        print(f"Chunk {i+1} (Length: {len(chunk.split())} words): {chunk[:200]}...")  # Print the first 200 characters of each chunk
    
    return chunks



from sklearn.metrics.pairwise import cosine_similarity

def score_chunks(query, chunks, model):
    """
    Scores each chunk based on its relevance to the query using cosine similarity.
    """
    query_embedding = model.encode([query], show_progress_bar=False)
    chunk_embeddings = model.encode(chunks, show_progress_bar=True)

    # Compute cosine similarity scores
    scores = cosine_similarity(query_embedding, chunk_embeddings).flatten()
    
    # Pair chunks with their scores
    chunk_scores = list(zip(chunks, scores))
    
    # Sort by scores in descending order
    chunk_scores = sorted(chunk_scores, key=lambda x: x[1], reverse=True)
    
    return chunk_scores


In [29]:
# Load the FAISS index
index = faiss.read_index("faiss_index.bin")

# Load sentences
with open('sentences.pkl', 'rb') as f:
    sentences = pickle.load(f)

# Function to retrieve top N relevant sections
def retrieve_relevant_sections(query, model, index, sentences, top_n=5):
    """
    Retrieves the top N relevant sections for a given query using FAISS.
    """
    # Encode the query using the same model
    query_embedding = model.encode([query])[0].astype('float32')

    # Search in the FAISS index
    distances, indices = index.search(np.array([query_embedding]), top_n)

    # Retrieve the corresponding sentences/sections
    relevant_sections = [sentences[idx] for idx in indices[0]]
    
    return relevant_sections

# Example usage
'''
query = "What are the best career options in technology?"
relevant_sections = retrieve_relevant_sections(query, model, index, sentences, top_n=3)
print("Top relevant sections:", relevant_sections)'''

'\nquery = "What are the best career options in technology?"\nrelevant_sections = retrieve_relevant_sections(query, model, index, sentences, top_n=3)\nprint("Top relevant sections:", relevant_sections)'

In [30]:
from transformers import pipeline

# Load the QA model pipeline from HuggingFace
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

In [31]:
def retrieve_and_answer(query, model, index, sentences, qa_pipeline, top_n=5, max_chunk_length=400):
    """
    Retrieves relevant sections, chunks them, scores them, and gets the answer using the QA model.
    """
    # Retrieve the top N relevant sections using FAISS
    relevant_sections = retrieve_relevant_sections(query, model, index, sentences, top_n)
    
    # Combine retrieved sections into one text and chunk them
    combined_text = " ".join(relevant_sections)

    print(f"Combined text length (in words): {len(combined_text.split())}")
    
    chunks = chunk_text(combined_text, max_length=max_chunk_length)
    
    # Score chunks for relevance
    chunk_scores = score_chunks(query, chunks, model)
    
    # Select the top chunk based on the score
    top_chunk = chunk_scores[0][0]  # Get the chunk with the highest relevance score
    
    # Use the QA model to find the answer in the most relevant chunk
    result = qa_pipeline({
        'question': query,
        'context': top_chunk
    })
    
    return result['answer'], top_chunk

# Example usage
query = "What are the best career options in technology?"
answer, top_chunk = retrieve_and_answer(query, model, index, sentences, qa_pipeline, top_n=5)

print("Most Relevant Chunk:", top_chunk)
print("Answer:", answer)


Combined text length (in words): 139560
Generated 1 chunks:
Chunk 1 (Length: 139560 words): CareerSevaComs EBook Edition Destiny Designers Comprehensive Guide Career Counseling Entrepreneurship India Teacher Working SelfEmployed Professional Digital Content Creator Housewife Simply Graduate ...


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.88s/it]


KeyboardInterrupt: 