In [19]:
# Install the notebook's third-party dependencies. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install PyPDF2 pdfplumber nltk regex

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
import nltk

# Tokenizer models needed by word_tokenize. Newer NLTK releases load the
# 'punkt_tab' resource instead of 'punkt', so fetch both to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists needed by stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
import PyPDF2
import pdfplumber
import nltk
import regex as re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [22]:
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: The text of all pages joined with newlines, so the last word of
        one page is never fused with the first word of the next. A page that
        yields no text contributes an empty string. A PDF without pages
        yields "".
    """
    with open(file_path, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)
        # Extract while the file is still open (as the original code did), and
        # guard against pages whose extraction result is None or empty.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    return "\n".join(page_texts)

In [23]:
def preprocess_text(text):
    """Normalize raw text into a list of lower-case, non-stop-word tokens.

    Steps:
      1. Replace every character that is not a word character or whitespace
         with a space. Underscores are replaced too, because `\\w` matches
         `_` and would otherwise let it through.
      2. Delete runs of digits.
      3. Lower-case the text and tokenize it with `word_tokenize`.
      4. Drop tokens found in NLTK's English stop-word list.

    Args:
        text: The raw text to process. Empty or None input yields [].

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    if not text:
        return []

    # Remove special characters (including '_', which \w would keep) and numbers
    processed_text = re.sub(r'[^\w\s]|_', ' ', text)
    processed_text = re.sub(r'\d+', '', processed_text)

    # Tokenize the text
    tokens = word_tokenize(processed_text.lower())

    # Remove stop words; a set gives O(1) membership checks
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

In [24]:
def extract_keywords(tokens, num_keywords=30):
    """Return the most frequent tokens, highest count first.

    Args:
        tokens: Iterable of tokens to count.
        num_keywords: Maximum number of keywords to return (default 30).

    Returns:
        list: Up to `num_keywords` tokens, as ordered by
        `nltk.FreqDist.most_common`.
    """
    counts = nltk.FreqDist(tokens)
    ranked = []
    # Keep only the token from each (token, count) pair.
    for word, _count in counts.most_common(num_keywords):
        ranked.append(word)
    return ranked

In [25]:
def scan_pdf_for_keywords(file_path, num_keywords=30):
    """Run the full pipeline: PDF -> text -> tokens -> top keywords.

    Args:
        file_path: Path of the PDF file to scan.
        num_keywords: Maximum number of keywords to return. Defaults to 30,
            the same default `extract_keywords` uses, so existing
            one-argument calls behave as before.

    Returns:
        The keyword list produced by `extract_keywords`.
    """
    # Step 1: Extract text from the PDF
    text = extract_text_from_pdf(file_path)

    # Step 2: Preprocess the text
    tokens = preprocess_text(text)

    # Step 3: Extract keywords
    return extract_keywords(tokens, num_keywords=num_keywords)

In [26]:
# Run the keyword pipeline on a sample resume and print the resulting keywords.
# NOTE(review): the path is absolute and looks Colab-specific ('/content/...');
# confirm the file exists there before running in another environment.
file_path = '/content/junior-data-analyst-resume-example.pdf'
keywords = scan_pdf_for_keywords(file_path)
print(keywords)

['data', 'analytics', 'analyst', 'ca', 'san', 'francisco', 'sql', 'reports', 'gender', 'age', 'junior', 'professional', 'tomislav', 'ad', 'hoc', 'including', 'business', 'power', 'bi', 'applications', 'tools', 'project', 'digital', 'detection', 'python', 'based', 'computer', 'facial', 'career', 'science']
