In [None]:
# Data Preprocessing

# Installing necessary libraries
!pip3 install PyPDF2 nltk

# Importing libraries
import os
import PyPDF2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Downloading NLTK data
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up in recent
# NLTK releases; 'punkt' is kept so older releases keep working.
# 'stopwords' backs stopwords.words('english').
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Function to convert PDF to plain text
def pdf_to_text(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Parameters
    ----------
    file_path : str or path-like
        Location of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        The page texts in page order, separated by a newline. A page whose
        extraction yields nothing contributes an empty string.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ''` keeps the join safe if a page's extraction returns None/empty.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next (which would create a bogus token downstream).
    return '\n'.join(page_texts)

# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Lowercase `text`, split it into word tokens and filter the tokens.

    A token is kept only when it is purely alphanumeric (``str.isalnum``) and
    is not in NLTK's English stop-word list.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    lowered_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = []
    for token in lowered_tokens:
        # Drops punctuation and any token containing non-alphanumeric characters.
        if not token.isalnum():
            continue
        if token in english_stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Directory containing research papers (PDF files)
papers_directory = 'research/'

# List to store preprocessed text (one token list per PDF, in sorted filename order)
preprocessed_text = []

# Iterate over research papers.
# os.listdir returns entries in arbitrary order, so sort them to make the
# document indices reproducible from one run to the next.
for filename in sorted(os.listdir(papers_directory)):
    # Case-insensitive match so files named '*.PDF' are not silently skipped.
    if filename.lower().endswith('.pdf'):
        file_path = os.path.join(papers_directory, filename)
        # Convert PDF to plain text
        text = pdf_to_text(file_path)
        # Clean and tokenize text
        tokens = clean_and_tokenize(text)
        # Append preprocessed text to the list
        preprocessed_text.append(tokens)

print("Data Preprocessing completed.")

In [None]:
# Confirm Preprocessing Results

print(f"Number of preprocessed documents: {len(preprocessed_text)}")
if preprocessed_text:
    print("\nSample preprocessed text:")
    print(preprocessed_text[0][:50])  # Print the first 50 tokens of the first document
else:
    # Indexing [0] on an empty list would raise IndexError; report the cause instead.
    print("\nNo documents were preprocessed; check that the papers directory contains PDF files.")

In [None]:
# Named Entity Recognition (NER)

# Installing necessary libraries
!pip3 install spacy

# Importing libraries
import spacy

# Loading the pre-trained NER model
# Alternative model: "en_core_web_sm".
NER_MODEL_NAME = "en_core_web_trf"
try:
    nlp = spacy.load(NER_MODEL_NAME)
except OSError:
    # spacy.load raises OSError when the model package is not installed;
    # fetch it once so the notebook runs on a fresh environment.
    import importlib
    from spacy.cli import download

    download(NER_MODEL_NAME)
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()
    nlp = spacy.load(NER_MODEL_NAME)

# Function to perform NER on preprocessed text
def perform_ner(text):
    """Run the loaded spaCy pipeline over a token list and collect its entities.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document; they are joined with single spaces before
        being passed to the module-level ``nlp`` pipeline.

    Returns
    -------
    list of tuple
        One ``(entity_text, entity_label)`` pair per entity in ``doc.ents``,
        in the order the pipeline reports them.
    """
    joined_tokens = " ".join(text)
    analysed_doc = nlp(joined_tokens)
    return [(span.text, span.label_) for span in analysed_doc.ents]

# Run NER over every preprocessed document (one entity list per document)
entities_list = [perform_ner(document_tokens) for document_tokens in preprocessed_text]

# Report the entities found in each document, numbering documents from 1
for document_number, document_entities in enumerate(entities_list, start=1):
    print(f"Document {document_number} entities:")
    for entity_text, entity_label in document_entities:
        print(f"- {entity_text} ({entity_label})")
    print()

print("Named Entity Recognition completed.")