In [21]:
!pip install PyMuPDF
!pip install rank_bm25
!pip install langchain
!pip install sentence-transformers

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.0-cp310-none-manylinux2014_x86_64.whl (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.0 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.0 PyMuPDFb-1.24.0


In [22]:
import numpy as np
import fitz  # PyMuPDF
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import PyPDF2
from nltk.tokenize import sent_tokenize

In [9]:
# Ensuring necessary NLTK datasets are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
def load_dataset(path):
    with open(path) as file:
        dataset = json.load(file)
    return dataset

dataset = load_dataset('/content/drive/MyDrive/Data/train-v2.0.json')

In [24]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    texts = []
    for page in doc:
        texts.append(page.get_text())
    return texts

In [41]:
document_context = '/content/drive/MyDrive/Data/contexts_train.pdf'
docs = extract_text_from_pdf(document_context)

In [23]:
# Preprocessing
def preprocess_documents(docs):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    preprocessed_docs = []
    for doc in docs:
        doc = doc.lower()
        doc = doc.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(doc)
        tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stop_words]
        preprocessed_docs.append(' '.join(tokens))
    return preprocessed_docs

In [42]:
#Preprocess Text Data
preprocessed_docs = preprocess_documents(docs)

In [43]:
# Example of vectorizing the preprocessed documents
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

In [44]:
# Outputs
print("Preprocessed Documents:")
for doc in docs:
    print(doc[:100])  # Print the first 100 characters of each document to check
print("\nShape of the TF-IDF matrix:", X.shape)

Preprocessed Documents:
Beyoncé Giselle Knowles-Carter (/biIIjInseI/ bee-YON-say) (born September 4, 1981) is an
American si
performed was not good. In 1995 Beyoncé's father resigned from his job to manage the group. The
move
album, with Missy Elliott, MC Lyte, and Free which was also used to promote the film. Another of
Bey
Achieving the accomplishment of becoming her longest-running Hot 100 single in her career, "Halo"'s

"End of Time". "Eat, Play, Love", a cover story written by Beyoncé for Essence that detailed her 201
America festival in early September and also the Global Citizen Festival later that month. Beyoncé
m
In 2015 Beyoncé signed an open letter which the ONE Campaign had been collecting signatures for;
the
sharing them with producers.
In 2001, she became the first African-American woman and second woman s
Described as being "sexy, seductive and provocative" when performing on stage, Beyoncé has said
that
singles – particularly R&B; singles – regained their status as pop'

In [45]:
# Splitting Text into Chunks i.e. each document into sentences
docs_sentences = [sent_tokenize(doc) for doc in docs]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
print(docs_sentences)



In [46]:
# Generating Embeddings
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Flattening the list of sentences
sentences = [sentence for doc in docs_sentences for sentence in doc]
sentence_embeddings = model.encode(sentences)


In [47]:
# BM25 for Sparse Retrieval
from rank_bm25 import BM25Okapi

# Tokenizing the sentences for BM25
tokenized_docs = [doc.split() for doc in sentences]
bm25 = BM25Okapi(tokenized_docs)

In [52]:
# Query and Retrieve Documents
query = "In what country is Normandy located?"
tokenized_query = query.split()

# Get scores for each document or sentence
doc_scores = bm25.get_scores(tokenized_query)

# Finding the highest scoring documents
top_doc_indices = np.argsort(doc_scores)[::-1][:5]

for idx in top_doc_indices:
    print(f"Doc {idx + 1}: {sentences[idx]} (Score: {doc_scores[idx]:.2f})")
    print("\n")

Doc 89892: The country joined what is now the European Union in
1981. (Score: 14.99)


Doc 27494: In 1214 John began his final campaign to reclaim Normandy from Philip. (Score: 13.57)


Doc 38066: Civil disorder spread
throughout the country in what became known as the Autumn uprising. (Score: 12.08)


Doc 44383: In English, the country is popularly known as either "Burma" or "Myanmar" i/ImjIInImIIr/. (Score: 11.14)


Doc 30985: In the Channel Islands and Isle of Man, which are Crown dependencies rather than
separate realms, she is known as Duke of Normandy and Lord of Mann, respectively. (Score: 11.12)


