In [25]:
import pdfplumber
import re
import nltk
from nltk.corpus import stopwords

import csv


In [20]:
# Download the NLTK resources used by clean_text (only needs to succeed once
# per machine; already-present packages are reported as up-to-date).
nltk.download('punkt')
# NOTE(review): recent NLTK releases (3.9+) load the Punkt tokenizer from the
# 'punkt_tab' resource instead of 'punkt'; fetching both keeps
# sent_tokenize / word_tokenize working on old and new installs -- confirm
# against the NLTK version pinned for this project.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vyshn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vyshn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
# Define a function to clean and preprocess text

# Repeating page header / footer text, compiled once instead of on every call.
# NOTE(review): the pattern text is kept exactly as originally written. The "."
# characters are unescaped (they match any character) and the trailing "2023*"
# means "202" followed by zero or more "3". The footer alternative also expects
# four literal spaces before the copyright sign. Confirm against the raw PDF
# text before tightening any of this.
_HEADER_FOOTER_RE = re.compile(
    r'CareerSeva.Com\'s Destiny Designers: A Comprehensive Guide to Career Counseling Entrepreneurship in India - 2023*'
    r'|CareerSeva.Com -A2Z Career Guidance & Planning    © Sfurti Media Production, Pune.2023*'
)
_WHITESPACE_RE = re.compile(r'\s+')
_SPECIAL_CHARS_RE = re.compile(r'[^\w\s]')

# Lazily-filled cache so the stop-word corpus is read once, not once per page.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(text):
    """
    Cleans and preprocesses the text for vectorization.

    Steps, in order:
    - Removes the known page header / footer strings.
    - Collapses every run of whitespace into a single space.
    - Deletes every character that is neither a word character nor whitespace.
    - Splits the result with ``nltk.sent_tokenize`` and each piece with
      ``nltk.word_tokenize``.
    - Drops English stop words (compared case-insensitively).

    Note: punctuation is stripped *before* sentence tokenization, so the
    sentence tokenizer has no sentence-ending punctuation left to split on;
    the step is kept only to preserve the existing output.

    Args:
        text: Raw text (e.g. one PDF page). ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        A single string of the remaining tokens separated by single spaces.
    """
    # Nothing to clean; also avoids touching the NLTK corpora needlessly.
    if not text:
        return ''

    # Remove headers and footers (assumption: they repeat every page)
    text = _HEADER_FOOTER_RE.sub('', text)

    # Normalise whitespace first, then drop special characters
    text = _WHITESPACE_RE.sub(' ', text)
    text = _SPECIAL_CHARS_RE.sub('', text)

    stop_words = _english_stop_words()
    processed_sentences = [
        ' '.join(
            word
            for word in nltk.word_tokenize(sentence)
            if word.lower() not in stop_words
        )
        for sentence in nltk.sent_tokenize(text)
    ]

    # Return cleaned sentences as a single string
    return ' '.join(processed_sentences)

In [22]:
# Define a function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF with pdfplumber and return the cleaned text of all its pages.

    Every page is passed through ``page.extract_text()``; pages that yield no
    text are skipped. The text of each remaining page is run through
    ``clean_text`` and the results are joined with single spaces, with
    surrounding whitespace stripped from the final string.
    """
    with pdfplumber.open(pdf_path) as document:
        # Lazy generator: each page is extracted right before it is cleaned.
        raw_pages = (page.extract_text() for page in document.pages)
        cleaned_pages = [clean_text(raw) for raw in raw_pages if raw]

    return ' '.join(cleaned_pages).strip()

In [23]:
# Page footer as it appears AFTER clean_text has stripped punctuation and stop
# words. One or more consecutive footers, together with the whitespace around
# them, are matched as a single unit.
_FOOTER_RE = re.compile(
    r"(?:\s*"
    r"CareerSevaCom A2Z Career Guidance Planning Sfurti Media Production Pune2023 Page \d+"
    r")+\s*"
)


def remove_footer(text):
    """
    Remove the cleaned page footer ("... Pune2023 Page <n>") from ``text``.

    Each footer occurrence (or run of back-to-back footers) is replaced,
    together with its surrounding whitespace, by a single space, so the words
    on either side stay separated by exactly one space instead of two. If a
    footer sat at the very start or end of the text, the leftover edge space
    is stripped as well.

    Args:
        text: Text produced by the cleaning step.

    Returns:
        The text without footers. Text that contains no footer is returned
        unchanged.
    """
    cleaned_text, hits = _FOOTER_RE.subn(' ', text)
    if not hits:
        return text
    return cleaned_text.strip()

In [24]:
# Location of the source PDF
pdf_path = 'Dataset.pdf'

# Extract and clean the text of every page, then remove the page footer
processed_text = remove_footer(extract_text_from_pdf(pdf_path))

# Show the first 1000 characters as a quick sanity check
print(processed_text[:1000])

CareerSevaComs EBook Edition Destiny Designers 2023 Comprehensive Guide Career Counseling Entrepreneurship India Teacher Working SelfEmployed Professional Digital Content Creator Housewife Simply Graduate Want Make Rewarding Career Career Counseling Begin Journey  Unlocking Secrets Successful Career Counseling Future Educators Coaches Author Editor CareerSevacoms Expert Career Counselors Content Creation Team EBook MRP INR 299 Published Sfurti Media Production Pune  Preface fluid tapestry life dreams interweave destiny aspirations intersect reality stands beacon guidance wisdom career counselor stand precipice 2023 era shaped technological leaps environmental challenges socio economic transformations role career counselor never critical task intricate Comprehensive Training Guide Setting Successful Career Counseling Practice 2023 compilation theories strategies tools heartfelt ode every individual chosen noble pursuit lighting path others pages find latest trends insights professional 

In [26]:
def save_tokens_to_csv(tokens_string, filename, encoding='utf-8'):
    """
    Write the whitespace-separated tokens of ``tokens_string`` to a CSV file.

    All tokens are written as ONE row (one token per column). An existing
    file at ``filename`` is overwritten.

    Args:
        tokens_string: Text whose tokens are separated by whitespace.
        filename: Path of the CSV file to create.
        encoding: Text encoding of the output file. Defaults to UTF-8 so that
            non-ASCII characters extracted from the PDF do not raise
            ``UnicodeEncodeError`` on platforms whose locale encoding is
            narrower (e.g. cp1252 on Windows).
    """
    tokens = tokens_string.split()
    # newline='' lets the csv module control line endings itself.
    with open(filename, mode='w', newline='', encoding=encoding) as file:
        csv.writer(file).writerow(tokens)

# Export the whitespace-separated tokens of the processed text to a CSV file
filename = "tokens.csv"
save_tokens_to_csv(tokens_string=processed_text, filename=filename)