In [6]:
! pip install pypdf2 nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm (from nltk)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading regex-2024.5.15-cp312-cp312-manylinux_2_17_x86_64.manyl

In [10]:
import os
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the 'punkt' data package through NLTK's downloader.
# Presumably this is the data sent_tokenize/word_tokenize (imported above)
# rely on — confirm against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/oni/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages in document order, with a newline between
        consecutive pages. A page that yields no text contributes an empty
        string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open, hence the list.
        # `or ''` guards against a page yielding no text at all.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next one, which plain concatenation would do.
    return '\n'.join(page_texts)

def process_text(text):
    """Split text into sentences and each sentence into word tokens.

    Args:
        text: Raw text to tokenize.

    Returns:
        list: One list of tokens per sentence, in sentence order.
    """
    tokenized_sentences = []
    # sent_tokenize yields the sentences; word_tokenize splits each into tokens.
    for sentence in sent_tokenize(text):
        tokenized_sentences.append(word_tokenize(sentence))
    return tokenized_sentences

def save_processed_text(processed_text, output_path):
    """Write tokenized sentences to a UTF-8 text file, one sentence per line.

    Args:
        processed_text: Iterable of sentences, each an iterable of string
            tokens.
        output_path: Path of the file to create or overwrite.
    """
    with open(output_path, 'w', encoding='utf-8') as out_file:
        # Tokens of a sentence are separated by single spaces; every sentence
        # ends with a newline.
        out_file.writelines(
            ' '.join(tokens) + '\n' for tokens in processed_text
        )


In [14]:
def process_pdfs_in_folder(folder_path):
    """Tokenize every PDF in a folder and save the result next to it.

    For each entry whose extension is ``.pdf`` (matched case-insensitively)
    the text is extracted with ``extract_text_from_pdf``, tokenized with
    ``process_text`` and written with ``save_processed_text`` to
    ``<name>_processed.txt`` in the same folder. Entries are handled in sorted
    order so repeated runs report files in the same order.

    Args:
        folder_path: Directory containing the PDF files.
    """
    for filename in sorted(os.listdir(folder_path)):
        # splitext only splits off the final extension; str.replace('.pdf', ...)
        # would also rewrite a '.pdf' occurring earlier in the file name.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != '.pdf':
            continue
        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(pdf_path)
        processed_text = process_text(text)
        output_path = os.path.join(folder_path, stem + '_processed.txt')
        save_processed_text(processed_text, output_path)
        print(f'Processed and saved: {output_path}')


In [16]:
# Folder holding the PDFs, given relative to the notebook's working directory.
# process_pdfs_in_folder writes each *_processed.txt file into this same folder.
pdf_folder_path = 'books'
process_pdfs_in_folder(pdf_folder_path)

Processed and saved: books/Book2_processed.txt
Processed and saved: books/Book1_processed.txt
Processed and saved: books/Book3_processed.txt


## Cleaning the Data

In [17]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

In [18]:
# Download the NLTK data packages used by the cleaning step:
# 'punkt' (tokenizer data), 'stopwords' (read via stopwords.words) and
# 'wordnet' (presumably backing WordNetLemmatizer — confirm).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/oni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/oni/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/oni/nltk_data...


True

In [23]:
# Folder containing the PDFs to clean (relative path); the cleaning loop below
# also writes its *_cleaned.txt output files into this folder.
pdf_folder_path = 'books'

In [21]:
# Function to clean text
def clean_text(text):
    """Normalize raw text into a string of lowercase, lemmatized words.

    Steps: replace every run of non-letter characters with one space,
    lowercase, tokenize, drop English stop words, lemmatize each remaining
    word and join the result with single spaces.

    Args:
        text: Raw text to clean.

    Returns:
        str: The remaining lemmatized words separated by single spaces.
    """
    # Step 1: Turn each run of non-letters (digits, punctuation, whitespace)
    # into a single space. Deleting such characters outright fuses neighbouring
    # words ("well-known" -> "wellknown") and turns contractions into tokens
    # that escape the stop-word filter ("can't" -> "cant").
    cleaned_text = re.sub(r'[^A-Za-z]+', ' ', text)

    # Step 2: Convert to lowercase so the stop-word comparison below matches.
    cleaned_text = cleaned_text.lower()

    # Step 3: Remove stop words (set membership keeps the filter fast).
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(cleaned_text)
    filtered_words = [word for word in words if word not in stop_words]

    # Step 4: Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    # Join words back to a single string
    return ' '.join(lemmatized_words)


In [35]:
# Process each PDF file (sorted so repeated runs report files in the same order)
for filename in sorted(os.listdir(pdf_folder_path)):
    # Match the extension case-insensitively and split it off with splitext:
    # str.replace('.pdf', ...) would also rewrite a '.pdf' occurring earlier
    # in the file name.
    stem, extension = os.path.splitext(filename)
    if extension.lower() != '.pdf':
        continue

    pdf_path = os.path.join(pdf_folder_path, filename)

    # Read the text from the PDF with the helper defined earlier in this
    # notebook instead of repeating the page loop here.
    text = extract_text_from_pdf(pdf_path)

    # Clean the text
    cleaned_text = clean_text(text)

    # Define the output file path
    output_file_path = os.path.join(pdf_folder_path, stem + '_cleaned.txt')

    # Write the cleaned text to a new file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)

    print(f"Cleaned text saved to {output_file_path}")

Cleaned text saved to books/Book2_cleaned.txt
Cleaned text saved to books/Book1_cleaned.txt
Cleaned text saved to books/Book3_cleaned.txt
