In [1]:
import spacy
import re
from PyPDF2 import PdfReader
from transformers import pipeline
import pandas as pd

In [2]:
def extract_and_clean_text(pdf_path, start_page):

    extracted_text = ""

    reader = PdfReader(pdf_path)

    total_pages = len(reader.pages)
    for page_number in range(start_page - 1, total_pages):
        page = reader.pages[page_number]
        page_text = page.extract_text()
        cleaned_text = ""
        cleaned_text = re.sub(r"<<.*?>>", "", page_text, flags=re.DOTALL)
        cleaned_text = re.sub(r"p\. \d+", "", cleaned_text, flags=re.DOTALL)
        cleaned_text = re.sub(r"Page \d+|Act [IVX]+|Scene \d+", "", cleaned_text)
        cleaned_text = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned_text)
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
        extracted_text += cleaned_text

    return extracted_text

In [3]:
pdf_path = "Shakespeare-Complete-Works.pdf"
pdf_text = extract_and_clean_text(pdf_path,77)
plays = pdf_text.split("THE END")

In [5]:
def extract_title(text_list, name="William Shakespeare"):
    results = []
    for text in text_list:
        match = re.search(r"(\d+)\s+(.*?)\s+" + re.escape(name), text)
        if match:
          text = match.group(2)
          match = re.search(r".*\d+\s+(.*)", text)
          pattern = re.compile(rf"by$", re.IGNORECASE)
          if match:
            cleaned_text = re.sub(pattern, "", match.group(1)).strip()
            results.append(cleaned_text)
          else:
            cleaned_text = re.sub(pattern, "", text).strip()
            results.append(cleaned_text)

        else:
            results.append(None)

    return results

In [53]:
titles = extract_title(plays)
titles = pd.DataFrame(titles, columns=['Title'])
titles = titles.dropna()[:36].reset_index().drop('index', axis=1)
file_path = "titles.csv"
titles.to_csv(file_path, index=False)
print(f"DataFrame saved to {file_path}")

DataFrame saved to titles.csv


In [7]:
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 4703297

def tokenize_large_text(text, chunk_size=100000):

    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    tokens = []
    for chunk in chunks:
        doc = nlp(chunk)
        chunk_tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
        tokens.extend(chunk_tokens)

    return tokens

In [28]:
title_list = titles.Title.unique()[1:37]
index_list = []
for title in title_list:
  index = titles.index[titles['Title'] == title].tolist()[0]
  index_list.append(index)

In [30]:
full_copus = []
for index in index_list:
  tokens = tokenize_large_text(plays[index])
  copus = " ".join(tokens)
  full_copus.append(copus)

In [31]:
import pickle

file_path = "copus.pkl"
with open(file_path, "wb") as file:
    pickle.dump(full_copus, file)

print(f"Copus saved to {file_path}")


Copus saved to copus.pkl


In [36]:
with open(file_path, "rb") as file:
    full_copus = pickle.load(file)

In [41]:
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [56]:
title = 'AS YOU LIKE IT'
index = titles.index[titles['Title'] == title].tolist()[0]
context = full_copus[index]
result = question_answerer(question="Who is LAFEU?", context=context)
print(f"Answer: '{result['answer']}'")

Answer: 'good housewife'
