In [15]:
import fitz
import pandas as pd

In [16]:
#Loading the dataset
# Forward slashes are used so the literal contains no backslash escapes
# ('\d' and '\m' are invalid escape sequences) and the path is written the
# same way as the CSV path used when the result is saved.
pdf_doc = fitz.open('../data/med_knowledge0.pdf')
# Accumulator for the text extracted from the document's pages.
text = ""

In [17]:
# Build the full document text from scratch, page by page, in page order.
# Assigning (rather than appending with +=) keeps this cell idempotent:
# re-running it no longer duplicates the document text.
text = "".join(
    pdf_doc.load_page(page_num).get_text()
    for page_num in range(pdf_doc.page_count)
)

In [18]:
# Wrap the extracted text in a one-row frame whose only column is 'text'.
df = pd.DataFrame({'text': [text]})
df.head()

Unnamed: 0,text
0,"May 28, 2024 ? Stair climbing has long been to..."


In [19]:
#Converting to lowercase
# Lower-case every string cell, column by column; non-string values are passed
# through unchanged. Series.map is used instead of DataFrame.applymap because
# applymap is deprecated in pandas (FutureWarning since 2.1).
df = df.apply(
    lambda column: column.map(
        lambda value: value.lower() if isinstance(value, str) else value
    )
)
df

Unnamed: 0,text
0,"may 28, 2024 ? stair climbing has long been to..."


In [20]:
#Removing URLs
import re

# Matches http:// or https:// links, and bare "www." hosts, up to the next
# whitespace character.
url_pattern = re.compile(r'https?://\S+|www\.\S+')

def remove_urls(text):
    """Return `text` with every substring matched by `url_pattern` deleted."""
    cleaned = re.sub(url_pattern, r'', text)
    return cleaned
# Strip URLs from every entry of the 'text' column.
df['text'] = df['text'].apply(remove_urls)
df

Unnamed: 0,text
0,"may 28, 2024 ? stair climbing has long been to..."


In [21]:
#Removing non-word characters
# Anything that is neither a word character (\w) nor whitespace (\s) is
# replaced with the empty string in every string cell of the frame.
non_word_pattern = r'[^\w\s]'
df = df.replace(non_word_pattern, '', regex=True)
df

Unnamed: 0,text
0,may 28 2024 stair climbing has long been tout...


In [24]:
#Saving to csv to review the result
# index=False leaves the row index out of the file; the 'utf-8-sig' codec
# writes a UTF-8 byte-order mark at the start of the file.
df.to_csv(
    '../data/med_knowledge0.csv',
    index=False,
    encoding='utf-8-sig',
)

In [22]:
#Removing numbers
# Every run of one or more digits is replaced with the empty string in every
# string cell of the frame.
digits_pattern = r'\d+'
df = df.replace(digits_pattern, '', regex=True)
df

Unnamed: 0,text
0,may stair climbing has long been touted as ...


In [10]:
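#Tokenization using BertTokenizer (disabled: the code below is commented out, so any output shown under this cell comes from an earlier run)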
#from transformers import BertTokenizer
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#df['tokenized_text'] = df['text'].apply(lambda x: tokenizer.tokenize(x)) 
#df

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,text,tokenized_text
0,may stair climbing has long been touted as ...,"[may, stair, climbing, has, long, been, to, ##..."


In [23]:
#Tokenization using NLTK
import nltk
from nltk.tokenize import word_tokenize

# Make sure the 'punkt' tokenizer data is available locally.
nltk.download('punkt')

# Split each entry of the 'text' column into a list of word tokens.
df['tokenized_text'] = df['text'].apply(word_tokenize)
df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,tokenized_text
0,may stair climbing has long been touted as ...,"[may, stair, climbing, has, long, been, touted..."


In [25]:
#Removing stopwords
import nltk
from nltk.corpus import stopwords

# The stopword corpus has to be present locally before stopwords.words() can
# read it; fetch it here the same way the other cells fetch their NLTK data.
nltk.download('stopwords')

# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not English stopwords.
df['tokenized_text'] = df['tokenized_text'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
df

Unnamed: 0,text,tokenized_text
0,may stair climbing has long been touted as ...,"[may, stair, climbing, long, touted, feasible,..."


In [27]:
#Stemming
from nltk.stem import PorterStemmer

# One shared stemmer instance, reused for every token.
ps = PorterStemmer()

def stem_words(text):
    """Return a list holding the Porter stem of each token in `text`.

    `text` is an iterable of token strings; the result has the same order.
    """
    return list(map(ps.stem, text))

# Stem the stopword-filtered tokens and keep the result in its own column.
df['stemmed_text'] = df['tokenized_text'].apply(stem_words)
df


Unnamed: 0,text,tokenized_text,stemmed_text
0,may stair climbing has long been touted as ...,"[may, stair, climbing, long, touted, feasible,...","[may, stair, climb, long, tout, feasibl, free,..."


In [31]:
#Lemmatization
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    """Return the WordNet lemma of each token in `text`, in order.

    Each token is POS-tagged on its own (as a one-word sentence, i.e. without
    surrounding context), and the first letter of its tag selects the WordNet
    part of speech handed to the lemmatizer. Tags whose first letter is not
    J, N, V or R fall back to noun.

    `text` is an iterable of token strings; a list of lemmas is returned.
    """
    # First letter of the tagger's tag -> WordNet part-of-speech constant.
    pos_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    words = list(text)
    # Tag all tokens in one batch. Every token is still its own one-word
    # "sentence", so the tags match per-token nltk.pos_tag([word]) calls,
    # but the tagger is only set up once instead of once per token.
    tagged = nltk.pos_tag_sents([[w] for w in words])

    result = []
    for w, tags in zip(words, tagged):
        word_loc = tags[0][1][0].upper()
        result.append(lemmatizer.lemmatize(w, pos_dict.get(word_loc, wordnet.NOUN)))
    return result

df['lemmatized_text'] = df['tokenized_text'].apply(lemmatize_words)
df

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,text,tokenized_text,stemmed_text,lemmatized_text
0,may stair climbing has long been touted as ...,"[may, stair, climbing, long, touted, feasible,...","[may, stair, climb, long, tout, feasibl, free,...","[may, stair, climb, long, tout, feasible, free..."
