### Import necessary libraries

In [2]:
import pdfplumber
import re
import nltk
import csv
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Download the NLTK tokenizer data needed by nltk.sent_tokenize / nltk.word_tokenize.
# Packages that are already installed are reported as up-to-date, so re-running is cheap.
# 'punkt' serves older NLTK releases; newer releases load 'punkt_tab' instead,
# so both are fetched to keep the notebook runnable on a fresh environment.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vyshn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Functions for preprocessing

In [3]:
# Define a function to clean and preprocess text
def clean_text(text):
    """
    Cleans one chunk of extracted text and normalizes its tokenization.

    Steps:
    - Removes the known running header / publisher line when it appears verbatim.
    - Collapses every run of whitespace into a single space.
    - Splits the text into sentences with NLTK, word-tokenizes each sentence and
      re-joins the tokens with single spaces (so punctuation becomes its own
      space-separated token, e.g. "Production , Pune").

    No stop-word filtering is performed; every token is kept.

    Args:
        text: Raw text to clean. None or an empty string yields "".

    Returns:
        The cleaned sentences joined into a single string.
    """
    if not text:
        return ''

    # The boilerplate lines are literal text, so escape them: an unescaped "."
    # would match any character and a trailing "*" would act as a quantifier.
    boilerplate_lines = (
        "CareerSeva.Com's Destiny Designers: A Comprehensive Guide to Career Counseling Entrepreneurship in India - 2023",
        "CareerSeva.Com -A2Z Career Guidance & Planning    © Sfurti Media Production, Pune.2023",
    )
    boilerplate_pattern = '|'.join(re.escape(line) for line in boilerplate_lines)
    text = re.sub(boilerplate_pattern, '', text)

    # Replace any run of whitespace (including newlines) with a single space
    text = re.sub(r'\s+', ' ', text)

    # Tokenize into sentences, then into words, and re-join with single spaces
    processed_sentences = [
        ' '.join(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(text)
    ]

    # Return cleaned sentences as a single string
    return ' '.join(processed_sentences)

In [4]:
# Define a function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """
    Reads a PDF page by page and returns the cleaned text of all pages.

    Pages for which pdfplumber returns no text are skipped. Every other page
    is passed through clean_text, and the per-page results are joined with a
    single space; leading/trailing whitespace of the final string is stripped.
    """
    cleaned_pages = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw_page_text = page.extract_text()

            # Only pages that actually yielded text are cleaned and kept
            if raw_page_text:
                cleaned_pages.append(clean_text(raw_page_text))

    return ' '.join(cleaned_pages).strip()

In [5]:
def remove_footer(text):
    """
    Removes every occurrence of the page footer, including its "Page <n>" suffix.

    The pattern is written against tokenized text, where the comma is a
    separate token ("Production , Pune"). The literal dots are escaped so
    they only match a real "." rather than any character.

    Args:
        text: Text that may contain the footer.

    Returns:
        The text with each footer occurrence replaced by an empty string
        (surrounding spaces are left as they are).
    """
    pattern = r"CareerSeva\.Com -A2Z Career Guidance & Planning © Sfurti Media Production , Pune\.2023 Page \d+"
    return re.sub(pattern, '', text)

### Preprocess the data

In [6]:
# Location of the source document
pdf_path = 'Dataset.pdf'

# Pull the cleaned text out of the PDF, then strip the per-page footer from it
processed_text = remove_footer(extract_text_from_pdf(pdf_path))

# Show the first 1000 characters as a quick sanity check
print(processed_text[:1000])

CareerSeva.Com 's E-Book Edition Destiny Designers 2023 A Comprehensive Guide to Career Counseling Entrepreneurship in India If You Are A Teacher , Any Working or Self-Employed Professional , Digital Content Creator , Housewife Or Simply Any Graduate & Want To Make A Rewarding Career In Career Counseling ? Begin Your Journey Here ....  Unlocking The Secrets Of Successful Career Counseling For Future Educators And Coaches Author / Editor CareerSeva.com 's Expert Career Counselors & Content Creation Team E-Book MRP : INR 299/- Published By : Sfurti Media Production , Pune  Preface In the fluid tapestry of life , where dreams interweave with destiny , and aspirations intersect with reality , stands a beacon of guidance and wisdom : the career counselor . As we stand at the precipice of 2023 , an era shaped by technological leaps , environmental challenges , and socio- economic transformations , the role of the career counselor has never been more critical , nor their task more intricate .

#### Save the tokens to a CSV file

In [7]:
def save_tokens_to_csv(tokens_string, filename):
    """
    Writes the whitespace-separated tokens of tokens_string to a CSV file.

    The file is opened in write mode (an existing file is overwritten) and
    each token becomes its own single-column row.
    """
    # The split happens before the file is opened
    rows = ([token] for token in tokens_string.split())
    with open(filename, mode='w', newline='') as handle:
        csv.writer(handle).writerows(rows)

# Persist the processed text as one token per CSV row
filename = "tokens.csv"
save_tokens_to_csv(tokens_string=processed_text, filename=filename)

### Split the text into overlapping paragraphs, encode them into embeddings, and store the embeddings in a FAISS index

In [4]:
# Load the tokens from CSV again for reference
def load_tokens_from_csv(filename):
    """
    Reads a CSV file and returns the first column of every non-empty row.

    Rows that parse to an empty list (blank lines) are skipped; any columns
    after the first are ignored.
    """
    with open(filename, mode='r', newline='') as handle:
        return [record[0] for record in csv.reader(handle) if record]

# Inputs for this stage: the token file on disk and the embedding model to use
TOKENS_CSV = "tokens.csv"
EMBEDDING_MODEL_NAME = 'multi-qa-mpnet-base-dot-v1'

# Read the tokens back from disk and rebuild one continuous text from them
tokens = load_tokens_from_csv(TOKENS_CSV)
full_text = ' '.join(tokens)

# Instantiate the Sentence Transformer used to embed the paragraphs
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Function to split text into sentences
def split_into_sentences(text):
    """Returns the sentences of text as produced by nltk.sent_tokenize."""
    return nltk.sent_tokenize(text)

# Function to create overlapping paragraphs with 10 sentences per chunk
def create_overlapping_paragraphs(sentences, chunk_size=10, overlap=4):
    """
    Groups sentences into space-joined paragraphs that share sentences with their neighbours.

    A new paragraph starts every (chunk_size - overlap) sentences and holds up
    to chunk_size sentences, so consecutive paragraphs share `overlap`
    sentences. The final paragraphs may be shorter than chunk_size.

    Args:
        sentences: Sequence of sentence strings.
        chunk_size: Maximum number of sentences per paragraph.
        overlap: Number of sentences shared between consecutive paragraphs.

    Returns:
        List of paragraph strings (empty when sentences is empty).

    Raises:
        ValueError: If overlap is not smaller than chunk_size. The stride
            would then be zero or negative, which previously surfaced as an
            opaque range() error or as a silently empty result.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    paragraphs = []
    for start in range(0, len(sentences), step):
        chunk = sentences[start:start + chunk_size]
        if chunk:  # Ensure the chunk is not empty
            paragraphs.append(' '.join(chunk))
    return paragraphs

# Split the full text into sentences
sentences = split_into_sentences(full_text)

# Create overlapping paragraphs
paragraphs = create_overlapping_paragraphs(sentences, chunk_size=10, overlap=4)

# Define paths for saved embeddings and index
embeddings_file = 'paragraph_embeddings.npy'
index_file = 'paragraph_faiss.index'

paragraph_embeddings = None
paragraph_index = None

# Reuse the cached embeddings and index only if they still line up with the
# paragraphs computed above. Index ids are positions in `paragraphs`, so a
# cache built from a different document or chunking would point at the wrong
# text. This is a count check only; it cannot detect content changes that
# leave the number of paragraphs unchanged.
if os.path.exists(embeddings_file) and os.path.exists(index_file):
    cached_embeddings = np.load(embeddings_file)
    cached_index = faiss.read_index(index_file)
    if len(cached_embeddings) == len(paragraphs) and cached_index.ntotal == len(paragraphs):
        paragraph_embeddings = cached_embeddings
        paragraph_index = cached_index
    else:
        print("Cached embeddings/index do not match the current paragraphs; rebuilding.")

if paragraph_index is None:
    if not paragraphs:
        raise ValueError("No paragraphs to embed: the token text produced no sentences.")

    # Create embeddings for each paragraph
    paragraph_embeddings = embedding_model.encode(paragraphs)

    # Save paragraph embeddings to disk
    np.save(embeddings_file, paragraph_embeddings)

    # Initialize a new FAISS index for paragraphs
    # NOTE(review): the embedding model loaded above ('multi-qa-mpnet-base-dot-v1')
    # appears to be tuned for dot-product scoring, while IndexFlatL2 ranks by
    # Euclidean distance. Confirm whether an inner-product index was intended.
    dimension = paragraph_embeddings.shape[1]
    paragraph_index = faiss.IndexFlatL2(dimension)
    paragraph_index.add(np.array(paragraph_embeddings))

    # Save FAISS index to disk
    faiss.write_index(paragraph_index, index_file)

    print("FAISS Index and numpy array saved.")