In [1]:
import pdfplumber
import re
import nltk
from nltk.corpus import stopwords

import csv

from sentence_transformers import SentenceTransformer

import faiss
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Download NLTK data (you only need to run this once)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vyshn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vyshn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Define a function to clean and preprocess text
def clean_text(text):
    """
    Cleans and preprocesses the text for vectorization.
    - Removes headers, footers, special characters, and unnecessary whitespace.
    - Tokenizes text into sentences.
    - Removes stop words.
    """
    # Remove headers
    text = re.sub(r'CareerSeva.Com\'s Destiny Designers: A Comprehensive Guide to Career Counseling Entrepreneurship in India - 2023*|CareerSeva.Com -A2Z Career Guidance & Planning    © Sfurti Media Production, Pune.2023*', '', text)

    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    
    # Tokenize text into sentences
    sentences = nltk.sent_tokenize(text)
    
    # Remove stop words
    # stop_words = set(stopwords.words('english'))
    processed_sentences = []
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        filtered_words = [word for word in words]
        processed_sentences.append(' '.join(filtered_words))
    
    # Return cleaned sentences as a single string
    return ' '.join(processed_sentences)

In [5]:
# Define a function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file while handling images, tables, bullet points, headers, and footers.
    """
    extracted_text = ""
    
    # Open the PDF file
    with pdfplumber.open(pdf_path) as pdf:
        # Iterate through each page
        for page in pdf.pages:
            # Extract text from the page
            page_text = page.extract_text()
            
            # Skip pages with no text
            if not page_text:
                continue
            
            # Clean the extracted text
            cleaned_text = clean_text(page_text)
            
            # Append cleaned text
            extracted_text += cleaned_text + " "
    
    return extracted_text.strip()

In [9]:
def remove_footer(text):
    pattern = r"CareerSeva.Com -A2Z Career Guidance & Planning © Sfurti Media Production , Pune.2023 Page \d+"
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

In [6]:
# def remove_numeric_tokens(tokens_string):
#     tokens = tokens_string.split()
#     filtered_tokens = [token for token in tokens if not token.isdigit()]
#     result_string = ' '.join(filtered_tokens)
#     return result_string

In [7]:
# def remove_tokens_between(tokens_string):
#     """
#     Remove page containing table of contents
#     """
#     # Split the space-separated string into a list of tokens
#     tokens = tokens_string.split()
    
#     # Initialize flags for tracking token range
#     within_range = False
#     filtered_tokens = []
#     count = 0
    
#     for token in tokens:
#         if token == "-:Table":
#             within_range = True
#             count += 1
        
#         if not within_range:
#             filtered_tokens.append(token)
        
#         if token == "153":
#             within_range = False
#             count -= 1
    
#     # Join the filtered tokens back into a space-separated string
#     cleaned_tokens_string = ' '.join(filtered_tokens)
    
#     #print(count)
#     return cleaned_tokens_string

In [10]:
# Path to your PDF file
pdf_path = 'Dataset.pdf'

# Extract and preprocess text from the PDF
processed_text = extract_text_from_pdf(pdf_path)

# Remove Footer
processed_text = remove_footer(processed_text)

# # Remove Table of Contents page
# processed_text = remove_tokens_between(processed_text)

# # Remove number tokens
# processed_text = remove_numeric_tokens(processed_text)

# Print a preview of the processed text
print(processed_text[:1000])  # Display the first 500 characters to verify

CareerSeva.Com 's E-Book Edition Destiny Designers 2023 A Comprehensive Guide to Career Counseling Entrepreneurship in India If You Are A Teacher , Any Working or Self-Employed Professional , Digital Content Creator , Housewife Or Simply Any Graduate & Want To Make A Rewarding Career In Career Counseling ? Begin Your Journey Here ....  Unlocking The Secrets Of Successful Career Counseling For Future Educators And Coaches Author / Editor CareerSeva.com 's Expert Career Counselors & Content Creation Team E-Book MRP : INR 299/- Published By : Sfurti Media Production , Pune  Preface In the fluid tapestry of life , where dreams interweave with destiny , and aspirations intersect with reality , stands a beacon of guidance and wisdom : the career counselor . As we stand at the precipice of 2023 , an era shaped by technological leaps , environmental challenges , and socio- economic transformations , the role of the career counselor has never been more critical , nor their task more intricate .

In [18]:
def save_tokens_to_csv(tokens_string, filename):
    tokens = tokens_string.split()
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        for token in tokens:
            writer.writerow([token])  # Write each token on a new line

filename = "tokens.csv"
save_tokens_to_csv(processed_text, filename)

In [19]:
# Load tokens from CSV file
def load_tokens_from_csv(filename):
    tokens = []
    with open(filename, mode='r', newline='') as file:
        reader = csv.reader(file)
        for row in reader:
            if row:  # Ensure row is not empty
                tokens.append(row[0])  # Each token is stored in its own row
    return tokens

# Path to the CSV file containing tokens
filename = "tokens.csv"

# Load tokens
tokens = load_tokens_from_csv(filename)

# Initialize the multi-qa-mpnet-base-dot-v1 model
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Encode each token into an embedding
embeddings = model.encode(tokens)

# Convert the embeddings to a numpy array
embeddings = np.array(embeddings)

# Step 2: Save the Embeddings in a FAISS Database

# Get the dimension of the embeddings
dimension = embeddings.shape[1]

# Initialize a FAISS index with L2 (Euclidean) distance
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the FAISS index
index.add(embeddings)

# Save the FAISS index to disk
faiss.write_index(index, 'tokens_faiss_index.index')

print(f"FAISS index saved with {index.ntotal} tokens.")

FAISS index saved with 53606 tokens.


In [21]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import nltk
import csv

# Load the FAISS index
index = faiss.read_index('tokens_faiss_index.index')

# Load the tokens from CSV again for reference
def load_tokens_from_csv(filename):
    tokens = []
    with open(filename, mode='r', newline='') as file:
        reader = csv.reader(file)
        for row in reader:
            if row:
                tokens.append(row[0])
    return tokens

tokens = load_tokens_from_csv("tokens.csv")

# Load the Sentence Transformer model
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Load the extractive QA model (using a model fine-tuned on SQuAD 2.0)
qa_model = pipeline('question-answering', model='deepset/roberta-base-squad2')

# Function to split the text into sentences
def split_into_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# Load the full text (concatenated tokens)
full_text = ' '.join(tokens)

# Split the full text into sentences
sentences = split_into_sentences(full_text)

# Create embeddings for each sentence
sentence_embeddings = embedding_model.encode(sentences)

# Initialize a new FAISS index for sentences
dimension = sentence_embeddings.shape[1]
sentence_index = faiss.IndexFlatL2(dimension)
sentence_index.add(np.array(sentence_embeddings))

def get_relevant_context(question, k=5):
    """
    Retrieve the most relevant context (sentences) for a given question using FAISS.
    :param question: The user's question as a string.
    :param k: Number of most similar sentences to retrieve.
    :return: A combined string of the most relevant sentences.
    """
    # Encode the question using the same embedding model
    question_embedding = embedding_model.encode([question])

    # Search for the k most similar sentences in the FAISS index
    distances, indices = sentence_index.search(np.array(question_embedding).astype('float32'), k)
    
    # Retrieve the most relevant sentences
    relevant_sentences = [sentences[idx] for idx in indices[0]]
    
    # Combine the relevant sentences into a single context for QA
    combined_context = ' '.join(relevant_sentences)
    
    return combined_context

def answer_question(question):
    """
    Answer a question using extractive QA with context retrieved from FAISS.
    :param question: The user's question as a string.
    :return: The answer to the question.
    """
    # Get the most relevant context from the FAISS index
    context = get_relevant_context(question)
    
    # Use the extractive QA model to find the answer within the retrieved context
    answer = qa_model(question=question, context=context)
    
    return answer['answer']

# Example usage:
question = "What is career counseling?"
answer = answer_question(question)
print(f"Question: {question}\nAnswer: {answer}")


Question: What is career counseling?
Answer: a journey of self-discovery and understanding the market demand


'\nquery = "What are the best career options in technology?"\nrelevant_sections = retrieve_relevant_sections(query, model, index, sentences, top_n=3)\nprint("Top relevant sections:", relevant_sections)'




Combined text length (in words): 35
Generated 1 chunks:
Chunk 1 (Length: 35 words): Career Options Suggested : 1 . Suggested Career Paths : 1 . Career Options Presented : 1 . Invest in Continuous Learning : The career landscape changes rapidly . Consider post-graduate studies for spe...


Batches: 100%|██████████| 1/1 [00:00<00:00, 10.99it/s]


Most Relevant Chunk: Career Options Suggested : 1 . Suggested Career Paths : 1 . Career Options Presented : 1 . Invest in Continuous Learning : The career landscape changes rapidly . Consider post-graduate studies for specialization .
Answer: Career Options Suggested : 1 . Suggested Career Paths
