In [5]:
# Data Preprocessing

# Installing necessary libraries
!pip3 install PyPDF2 nltk

# Importing libraries
import os
import PyPDF2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources used below: the 'punkt' tokenizer data and the
# 'stopwords' corpus (read by clean_and_tokenize via stopwords.words).
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

# Function to convert PDF to plain text
def pdf_to_text(file_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        str: The text of all pages in page order, separated by newlines.
        A page that yields no text contributes an empty string.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ''` keeps the join safe if extraction yields None for a page.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page (which would create a bogus token).
    return '\n'.join(page_texts)

# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Lowercase ``text``, split it into word tokens and filter them.

    A token is kept only when it is purely alphanumeric (``str.isalnum``) and
    is not in NLTK's English stopword list.

    Args:
        text: Raw text to process.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    words = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for candidate in words:
        # Drop punctuation / mixed-symbol tokens
        if not candidate.isalnum():
            continue
        # Drop stopwords
        if candidate in english_stopwords:
            continue
        kept.append(candidate)
    return kept

# Directory containing research papers (PDF files)
papers_directory = 'research/'

# List to store preprocessed text (one list of tokens per paper)
preprocessed_text = []

# Iterate over research papers.
# os.listdir returns entries in arbitrary order, so sort them: document indices
# (and the "Document N" labels printed later) then stay the same between runs.
for filename in sorted(os.listdir(papers_directory)):
    # Match the extension case-insensitively so e.g. "paper.PDF" is not skipped
    if not filename.lower().endswith('.pdf'):
        continue
    file_path = os.path.join(papers_directory, filename)
    # Convert PDF to plain text
    text = pdf_to_text(file_path)
    # Clean and tokenize text
    tokens = clean_and_tokenize(text)
    # Append preprocessed text to the list
    preprocessed_text.append(tokens)

print("Data Preprocessing completed.")

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


[nltk_data] Downloading package punkt to /Users/ulyssorok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ulyssorok/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Preprocessing completed.


In [6]:
# Confirm Preprocessing Results

print(f"Number of preprocessed documents: {len(preprocessed_text)}")
# Guard the sample: indexing [0] would raise IndexError if no PDF was processed.
if preprocessed_text:
    print("\nSample preprocessed text:")
    print(preprocessed_text[0][:50])  # Print the first 50 tokens of the first document
    # NOTE(review): a previous run printed tokens such as 'dqwlelrwlf' and
    # 'dqdojhvlf' here, which read as 'antibiotic' / 'analgesic' with every
    # letter shifted by three. That suggests some PDFs use a font encoding the
    # extractor does not decode; verify extraction quality before trusting
    # downstream results.
else:
    print("\nNo documents were preprocessed; nothing to sample.")

Number of preprocessed documents: 48

Sample preprocessed text:
['issue', 'treatment', 'anailing', 'earth', '51', '7', '1', '63216', '3ulqw', 'kdqjh', '23', '5', 'v', '8', '2', '9', 'kdqjh', '23', '57', 'kdqjh', '675', '6', '5', '0', 'kdqjh', '6', 'kdqjh', '5', 'kdqjh', 'kdqjh', 'oh', '3ulqw', 'uwlfoh', 'oh', 'evwudfw', 'dqwlelrwlf', 'dqdojhvlf', 'kh', 'l', 'p', 'l', 'w', 'h', 'g', 'r', 'glr', 'xwkru', 'oh', '51', '7']


In [8]:
# Named Entity Recognition (NER)

# Installing necessary libraries
!pip3 install spacy

# Importing libraries
import spacy

# Loading the pre-trained NER model
# `nlp` is the module-level pipeline object that perform_ner() calls.
# NOTE(review): this notebook only pip-installs `spacy`; the "en_core_web_sm"
# model presumably has to be installed separately (e.g.
# `python -m spacy download en_core_web_sm`) before this line works on a fresh
# environment. Confirm and document that step.
nlp = spacy.load("en_core_web_sm")

# Function to perform NER on preprocessed text
def perform_ner(text):
    """Run the loaded ``nlp`` pipeline over one tokenized document.

    Args:
        text: Iterable of string tokens; they are joined with single spaces
            before being passed to ``nlp``.

    Returns:
        list[tuple[str, str]]: One ``(entity text, entity label)`` pair per
        item of ``doc.ents``, in the order the pipeline reports them.
    """
    joined = " ".join(text)
    return [(span.text, span.label_) for span in nlp(joined).ents]

# Applying NER to preprocessed text: one list of (text, label) pairs per document,
# in the same order as preprocessed_text.
# A comprehension keeps its loop variable local, so this cell no longer rebinds
# the notebook-level name `text` (the raw PDF string in the preprocessing cell)
# to a token list, nor leaves a stray `entities` behind.
entities_list = [perform_ner(doc_tokens) for doc_tokens in preprocessed_text]

# Report every document's entities as "- <text> (<label>)" lines under a
# 1-based "Document N entities:" heading, with a blank line after each document.
for doc_number, doc_entities in enumerate(entities_list, start=1):
    print(f"Document {doc_number} entities:")
    for entity_text, entity_label in doc_entities:
        print(f"- {entity_text} ({entity_label})")
    print()

print("Named Entity Recognition completed.")

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Document 1 entities:
- 23 (CARDINAL)
- 23 57 (CARDINAL)
- 675 (CARDINAL)
- 5 (CARDINAL)
- 6 (CARDINAL)
- 5 (CARDINAL)
- dqwlelrwlf dqdojhvlf kh l p l w (ORG)
- 70 (CARDINAL)
- 0f (CARDINAL)
- g q f u h (ORG)
- 6 2 (DATE)
- 51 (CARDINAL)
- 5 (CARDINAL)
- sdqvlrq (GPE)
- frpsoh (GPE)
- 70 (CARDINAL)
- 51 (CARDINAL)
- 5 (CARDINAL)
- 70 (MONEY)
- shfwhg p e u n k h (ORG)
- 5 (CARDINAL)
- 6slv (DATE)
- g k h l j k h (ORG)
- 3 (CARDINAL)
- 51 (CARDINAL)
- 5 (CARDINAL)
- 60 (CARDINAL)
- 6xq 51 (DATE)
- 6 (CARDINAL)

Document 2 entities:
- 5b6 (CARDINAL)
- principleby analogue neural networks (ORG)
- moon easy potentiallyuseful resources (GPE)
- moon moon (GPE)
- baseson moon (PERSON)
- 3d (CARDINAL)
- u h r u n e x c l u v el c e n et (ORG)
- os p r n (ORG)
- 2 2 (DATE)
- 2022 (DATE)
