# Extract text from PDF files

In [33]:
import fitz  # PyMuPDF
import os

# Path to the folder containing the PDF files
pdf_folder = "Harry-Potter-Dataset"

# Collect each page's text in a list and join once at the end; repeated `+=`
# on an ever-growing string can degrade to quadratic copying.
page_texts = []

# Walk the folder in sorted filename order so the books are concatenated
# in a deterministic order.
for pdf_file in sorted(os.listdir(pdf_folder)):
    # Compare the extension case-insensitively so a file such as "HP1.PDF"
    # is not silently skipped.
    if not pdf_file.lower().endswith(".pdf"):
        continue

    # Construct the full path to the PDF file
    pdf_path = os.path.join(pdf_folder, pdf_file)

    # The context manager closes the document handle even if extraction
    # fails part-way through a file.
    with fitz.open(pdf_path) as pdf:
        # Iterating a document yields its pages in order
        for page in pdf:
            page_texts.append(page.get_text())

    print(f"Extracted text from {pdf_file}")

# Single string holding the text of every page of every PDF
combined_text = "".join(page_texts)

print('\n Extracted text from all PDF files')

Extracted text from HP1.pdf
Extracted text from HP2.pdf
Extracted text from HP3.pdf
Extracted text from HP4.pdf
Extracted text from HP5.pdf
Extracted text from HP6.pdf
Extracted text from HP7.pdf

 Extracted text from all PDF files


In [34]:
# Optional (disabled): write the combined text to disk for reuse.
# output_file = "combined_harry_potter.txt"
# with open(output_file, "w", encoding="utf-8") as f:
#     f.write(combined_text)

# print(f"Combined text saved to {output_file}")

# Optional (disabled): reload the combined text saved by an earlier run.
# with open("combined_harry_potter.txt", "r", encoding="utf-8") as f:
#     combined_text = f.read()

# Preview a 1000-character window starting at offset 5000 (not the very
# beginning of the text).
preview_window = slice(5000, 6000)
print(combined_text[preview_window])

briefcase, pecked 
Mrs. Dursley on the cheek, and tried to kiss Dudley good-bye but 
missed, because Dudley was now having a tantrum and throwing 
his cereal at the walls. “Little tyke,” chortled Mr. Dursley as he left 
the house. He got into his car and backed out of number four’s 
drive. 
It was on the corner of the street that he noticed the first sign of 
something peculiar — a cat reading a map. For a second, Mr. 
Dursley didn’t realize what he had seen — then he jerked his head 
around to look again. There was a tabby cat standing on the corner  
 
THE  BOY  WHO  LIVED 
 3  
of Privet Drive, but there wasn’t a map in sight. What could he 
have been thinking of? It must have been a trick of the light. Mr. 
Dursley blinked and stared at the cat. It stared back. As Mr. Durs-
ley drove around the corner and up the road, he watched the cat in 
his mirror. It was now reading the sign that said Privet Drive — no, 
looking at the sign; cats couldn’t read maps or signs. Mr. Dursley 
gav

# Normalize the text

In [35]:
import re

def normalize_text(text):
    """Strip '#' characters and collapse whitespace in a block of text.

    Args:
        text: The raw string to clean.

    Returns:
        The input with every '#' removed, every run of whitespace (spaces,
        tabs, newlines) replaced by a single space, and leading/trailing
        whitespace stripped.
    """
    # Remove '#' before collapsing whitespace: doing it afterwards turns
    # "a # b" into "a  b", leaving a double space in the result.
    without_hashes = text.replace('#', '')
    return re.sub(r'\s+', ' ', without_hashes).strip()

In [36]:
# Run the normalizer over the full extracted text.
cleaned_text = normalize_text(combined_text)
# Show the first 1000 characters as the cell output to spot-check the result.
cleaned_text[:1000]

'Harry Potter And the Sorcerer’s Stone ALSO BY J. K. ROWLING Harry Potter and the Sorcerer’s Stone Year One at Hogwarts Harry Potter and the Chamber of Secrets Year Two at Hogwarts Harry Potter and the Prisoner of Azkaban Year Three at Hogwarts Harry Potter and the Goblet of Fire Year Four at Hogwarts Harry Potter and the Order of the Phoenix Year Five at Hogwarts Harry Potter and the Half-Blood Prince Year Six at Hogwarts Harry Potter and the Deathly Hallows Year Seven at Hogwarts Harry Potter and the Sorcerer’s Stone BY J. K. Rowling ILLUSTRATIONS BY Mary GrandPré ARTHUR A. LEVINE BOOKS AN IMPRINT OF SCHOLASTIC Press. For Jessica, who loves stories for Anne, who loved them too; and for Di, who heard this one first. Text copyright © 1997 by J.K. Rowling Illustrations by Mary GrandPré copyright © 1998 Warner Bros. All rights reserved. Published by Scholastic Press, a division of Scholastic Inc., Publishers since 1920 SCHOLASTIC, SCHOLASTIC PRESS, and the LANTERN LOGO are trademarks and

# Tokenize the text into sentences

In [37]:
import nltk
# Fetch the Punkt sentence-tokenizer data. Older NLTK releases load the
# 'punkt' resource, while NLTK 3.9+ loads 'punkt_tab' instead, so request both
# to keep this cell working on a fresh environment regardless of version.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

# Split the normalized text into a list of sentence strings
sentences = nltk.sent_tokenize(cleaned_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ws-\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
import pandas as pd

# Build a single-column DataFrame ('text') with one tokenized sentence per row
data = pd.DataFrame(sentences, columns=['text'])

In [39]:
# Display the sentence DataFrame as the cell output.
data

Unnamed: 0,text
0,Harry Potter And the Sorcerer’s Stone ALSO BY ...
1,"For Jessica, who loves stories for Anne, who l..."
2,Text copyright © 1997 by J.K. Rowling Illustra...
3,"Published by Scholastic Press, a division of S..."
4,"For information regarding permissions, write t..."
...,...
83263,This book was art directed by David Saylor.
83264,The art for both the jacket and the interior w...
83265,"The text was set in 12-point Adobe Garamond, a..."
83266,The book was typeset by Brad Walrod and was pr...


In [40]:
import os

# Save the sentence table to CSV; index=False leaves the row index out of the file.
data.to_csv('cleaned_story.csv', index=False)

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import pandas as pd
# import torchtext, datasets, math, torchtext
# from tqdm import tqdm
# from datasets import Dataset, DatasetDict

In [42]:
# Disabled cell: convert the DataFrame to a Hugging Face dataset, shuffle it,
# and split it into train / validation / test.
#
# NOTE(review): train_size (80%) and validation_size (20%) together cover
# essentially the whole dataset, so `test` only receives whatever is left over
# after int() truncation. The stored output of this notebook shows a test
# split of 1 row. If this cell is re-enabled, reserve an explicit test
# fraction (e.g. 80/10/10) — TODO confirm the intended proportions.

# hf_dataset = Dataset.from_pandas(data)

# hf_dataset = hf_dataset.shuffle(seed=42)

# train_size = int(0.8 * len(hf_dataset))
# validation_size = int(0.2 * len(hf_dataset))

# train_dataset = hf_dataset.select(range(train_size))
# validation_dataset = hf_dataset.select(range(train_size, train_size + validation_size))
# test_dataset = hf_dataset.select(range(train_size + validation_size, len(hf_dataset)))

# final_dataset = DatasetDict({
#     'train': train_dataset,
#     'validation': validation_dataset,
#     'test': test_dataset
# })


In [43]:
# final_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 66614
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 16653
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [44]:
# final_dataset.save_to_disk("harry_potter_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 66614/66614 [00:00<00:00, 324906.41 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 16653/16653 [00:00<00:00, 313177.86 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 157.57 examples/s]
