In [None]:
import fitz
import pandas as pd

In [None]:
#Loading the dataset
# Forward slashes resolve on Windows, macOS and Linux alike and keep the
# literal free of backslash escape sequences.
pdf_doc = fitz.open('../data/med_knowledge0.pdf')
text = ""

In [None]:
# Concatenate the text of every page, in page order.
# `text` is assigned rather than appended to, so re-running this cell
# rebuilds the string instead of duplicating the document.
text = "".join(
    pdf_doc.load_page(page_num).get_text()
    for page_num in range(pdf_doc.page_count)
)

In [None]:
# Single-row frame: the whole document lives in one cell of column 'text'.
df = pd.DataFrame({'text': [text]})

In [None]:
#Converting to lowercase
# DataFrame.applymap is deprecated in recent pandas; mapping each column with
# Series.map performs the same element-wise transform on every pandas version.
# Non-string values are passed through untouched.
df = df.apply(
    lambda column: column.map(lambda x: x.lower() if isinstance(x, str) else x)
)

In [None]:
#Removing URLs
import re
# Matches http:// or https:// links, and bare links starting with "www.",
# up to the next whitespace character.
url_pattern = re.compile(r'https?://\S+|www\.\S+')


def remove_urls(text):
    """Return ``text`` with every match of ``url_pattern`` deleted.

    Args:
        text: the string to clean.

    Returns:
        A copy of ``text`` in which each matched URL is replaced by the
        empty string; surrounding whitespace is left as it was.
    """
    return re.sub(url_pattern, r'', text)
# Run the URL scrubber over every entry of the 'text' column.
df['text'] = df['text'].apply(remove_urls)

In [None]:
#Removing punctuation
# Deletes every character that is neither a word character (\w: letters,
# digits, underscore) nor whitespace; applied to all string cells of the frame.
df = df.replace(r'[^\w\s]', '', regex=True)

In [None]:
#Removing numbers
# Deletes each run of one or more digits from all string cells of the frame.
df = df.replace(r'\d+', '', regex=True)

In [None]:
#Tokenization using NLTK
import nltk
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt models. Newer NLTK releases load them from the
# 'punkt_tab' package while older ones use 'punkt', so fetch both; a package
# that is already up to date is not downloaded again.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)
# Split each document string into a list of word tokens.
df['tokenized_text'] = df['text'].apply(word_tokenize)

In [None]:
#Removing stopwords
from nltk.corpus import stopwords
# The stop-word corpus has to be present locally before stopwords.words() can
# read it; without this download the cell raises LookupError on a fresh setup.
nltk.download('stopwords')
# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words('english'))
# NOTE(review): punctuation is stripped in an earlier cell, so contractions
# reach this point without their apostrophe (e.g. "dont") - confirm the stop
# list still catches the forms you care about.
df['tokenized_text'] = df['tokenized_text'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

In [None]:
#Stemming
from nltk.stem import PorterStemmer
# Single shared Porter stemmer instance; stem_words() calls ps.stem on every token.
ps = PorterStemmer()
def stem_words(text):
    """Stem each token with the module-level stemmer ``ps``.

    Args:
        text: iterable of token strings.

    Returns:
        list with ``ps.stem(token)`` for every token, in input order.
    """
    stems = []
    for token in text:
        stems.append(ps.stem(token))
    return stems
# Stem the stop-word-filtered tokens of every row into a new column.
df['stemmed_text'] = df['tokenized_text'].apply(stem_words)

In [None]:
#Lemmatization
# WordNet backs the lemmatizer; the perceptron tagger backs nltk.pos_tag.
# Newer NLTK releases load the tagger from 'averaged_perceptron_tagger_eng'
# while older ones use 'averaged_perceptron_tagger', so fetch both.
nltk.download('wordnet')
for tagger_resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(tagger_resource)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Single shared WordNet lemmatizer instance used by lemmatize_words().
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize a list of tokens with WordNet, guided by each token's POS tag.

    Every token is tagged on its own via ``nltk.pos_tag([word])``. The first
    letter of the returned tag selects a WordNet part of speech (J -> adjective,
    N -> noun, V -> verb, R -> adverb; anything else falls back to noun), and
    the module-level ``lemmatizer`` is called with that part of speech.

    Because a token is tagged in isolation, its lemma depends only on the token
    itself. Results are therefore memoised per distinct token for the duration
    of the call: the output is the same as tagging every occurrence, but a
    repeated token costs one tagger call instead of one per occurrence.

    Args:
        text: iterable of token strings.

    Returns:
        list of lemmas, one per input token, in input order.
    """
    # Built once per call instead of once per token.
    pos_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    lemma_by_token = {}

    def lemma_for(word):
        if word not in lemma_by_token:
            # pos_tag returns [(word, tag)]; keep only the tag's first letter.
            tag_initial = nltk.pos_tag([word])[0][1][0].upper()
            wordnet_pos = pos_dict.get(tag_initial, wordnet.NOUN)
            lemma_by_token[word] = lemmatizer.lemmatize(word, wordnet_pos)
        return lemma_by_token[word]

    return [lemma_for(w) for w in text]
# Lemmatize the stop-word-filtered tokens of every row into a new column.
df['lemmatized_text'] = df['tokenized_text'].apply(lemmatize_words)