In [None]:
pip install nltk



In [1]:
import re
import nltk
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer, MWETokenizer
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [15]:
resume_text = """
John Snow
Email: john.snow@example.com | Phone: +1 123-456-7890
Objective: To secure a challenging role in AI development.
Education: B.Sc. in Computer Science, XYZ University, 2020
Skills: Python, NLP, Machine Learning, TensorFlow, Keras
Experience:
- AI Engineer at ABC Corp (2021–Present): Developed chatbots and recommendation systems.
- Intern at DEF Inc. (2020): Assisted in data preprocessing for AI models.
"""
skills_list = {"python", "nlp", "machine learning", "tensorflow", "keras", "ai"}

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_and_filter(tokens, stop_words, lemmatizer):
    return [
        lemmatizer.lemmatize(token.lower())
        for token in tokens
        if token.lower() not in stop_words
    ]


tokenizer = WhitespaceTokenizer()
resume_tokens = preprocess_and_filter(tokenizer.tokenize(resume_text), stop_words, lemmatizer)
job_tokens = preprocess_and_filter(tokenizer.tokenize(job_description), stop_words, lemmatizer)

matching_skills = skills_list.intersection(set(resume_tokens), set(job_tokens))
print("Matching Skills:", matching_skills)

Matching Skills: {'ai'}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
stop_words = set(stopwords.words('english'))
def remove_stop_words(tokens):
    return [word for word in tokens if word not in stop_words]

In [5]:
cleaned_text = re.sub(r'\s+', ' ', resume_text)
cleaned_text = re.sub(r'[^\w\s@.-]', '', cleaned_text)
print(cleaned_text)

 John Snow Email john.snow@example.com  Phone 1 123-456-7890 Objective To secure a challenging role in AI development. Education B.Sc. in Computer Science XYZ University 2020 Skills Python NLP Machine Learning TensorFlow Keras Experience - AI Engineer at ABC Corp 2021Present Developed chatbots and recommendation systems. - Intern at DEF Inc. 2020 Assisted in data preprocessing for AI models. 


In [6]:
whitespace_tokenizer = WhitespaceTokenizer()
tokens_whitespace = whitespace_tokenizer.tokenize(cleaned_text)
print("Whitespace Tokens:", tokens_whitespace)

word_punct_tokenizer = WordPunctTokenizer()
tokens_punct = word_punct_tokenizer.tokenize(cleaned_text)
print("Punctuation-Based Tokens:", tokens_punct)

treebank_tokenizer = TreebankWordTokenizer()
tokens_treebank = treebank_tokenizer.tokenize(cleaned_text)
print("Treebank Tokens:", tokens_treebank)

tweet_tokenizer = TweetTokenizer()
tokens_tweet = tweet_tokenizer.tokenize(cleaned_text)
print("Tweet Tokens:", tokens_tweet)

Whitespace Tokens: ['John', 'Snow', 'Email', 'john.snow@example.com', 'Phone', '1', '123-456-7890', 'Objective', 'To', 'secure', 'a', 'challenging', 'role', 'in', 'AI', 'development.', 'Education', 'B.Sc.', 'in', 'Computer', 'Science', 'XYZ', 'University', '2020', 'Skills', 'Python', 'NLP', 'Machine', 'Learning', 'TensorFlow', 'Keras', 'Experience', '-', 'AI', 'Engineer', 'at', 'ABC', 'Corp', '2021Present', 'Developed', 'chatbots', 'and', 'recommendation', 'systems.', '-', 'Intern', 'at', 'DEF', 'Inc.', '2020', 'Assisted', 'in', 'data', 'preprocessing', 'for', 'AI', 'models.']
Punctuation-Based Tokens: ['John', 'Snow', 'Email', 'john', '.', 'snow', '@', 'example', '.', 'com', 'Phone', '1', '123', '-', '456', '-', '7890', 'Objective', 'To', 'secure', 'a', 'challenging', 'role', 'in', 'AI', 'development', '.', 'Education', 'B', '.', 'Sc', '.', 'in', 'Computer', 'Science', 'XYZ', 'University', '2020', 'Skills', 'Python', 'NLP', 'Machine', 'Learning', 'TensorFlow', 'Keras', 'Experience', '

In [7]:
porter_stemmer = PorterStemmer()
stemmed_words_porter = [porter_stemmer.stem(word) for word in tokens_whitespace]
print("Porter Stemmed Words:", stemmed_words_porter)

snowball_stemmer = SnowballStemmer("english")
stemmed_words_snowball = [snowball_stemmer.stem(word) for word in tokens_whitespace]
print("Snowball Stemmed Words:", stemmed_words_snowball)

Porter Stemmed Words: ['john', 'snow', 'email', 'john.snow@example.com', 'phone', '1', '123-456-7890', 'object', 'to', 'secur', 'a', 'challeng', 'role', 'in', 'ai', 'development.', 'educ', 'b.sc.', 'in', 'comput', 'scienc', 'xyz', 'univers', '2020', 'skill', 'python', 'nlp', 'machin', 'learn', 'tensorflow', 'kera', 'experi', '-', 'ai', 'engin', 'at', 'abc', 'corp', '2021present', 'develop', 'chatbot', 'and', 'recommend', 'systems.', '-', 'intern', 'at', 'def', 'inc.', '2020', 'assist', 'in', 'data', 'preprocess', 'for', 'ai', 'models.']
Snowball Stemmed Words: ['john', 'snow', 'email', 'john.snow@example.com', 'phone', '1', '123-456-7890', 'object', 'to', 'secur', 'a', 'challeng', 'role', 'in', 'ai', 'development.', 'educ', 'b.sc.', 'in', 'comput', 'scienc', 'xyz', 'univers', '2020', 'skill', 'python', 'nlp', 'machin', 'learn', 'tensorflow', 'kera', 'experi', '-', 'ai', 'engin', 'at', 'abc', 'corp', '2021present', 'develop', 'chatbot', 'and', 'recommend', 'systems.', '-', 'intern', 'at

In [8]:
lemmatizer = WordNetLemmatizer()

def lemmatize_with_pos(word):
    return lemmatizer.lemmatize(word, pos=wordnet.VERB)

lemmatized_words = [lemmatize_with_pos(word) for word in tokens_whitespace]
print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['John', 'Snow', 'Email', 'john.snow@example.com', 'Phone', '1', '123-456-7890', 'Objective', 'To', 'secure', 'a', 'challenge', 'role', 'in', 'AI', 'development.', 'Education', 'B.Sc.', 'in', 'Computer', 'Science', 'XYZ', 'University', '2020', 'Skills', 'Python', 'NLP', 'Machine', 'Learning', 'TensorFlow', 'Keras', 'Experience', '-', 'AI', 'Engineer', 'at', 'ABC', 'Corp', '2021Present', 'Developed', 'chatbots', 'and', 'recommendation', 'systems.', '-', 'Intern', 'at', 'DEF', 'Inc.', '2020', 'Assisted', 'in', 'data', 'preprocessing', 'for', 'AI', 'models.']


In [9]:
email = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', cleaned_text)
print("Email:", email.group() if email else "Not found")

Email: john.snow@example.com


In [10]:
phone = re.search(r'\+?\d[\d -]{8,}\d', cleaned_text)
print("Phone:", phone.group() if phone else "Not found")

Phone: 1 123-456-7890


In [11]:
skills = re.findall(r'\b(Python|NLP|Machine Learning|TensorFlow|Keras)\b', cleaned_text, re.IGNORECASE)
print("Skills:", skills)

Skills: ['Python', 'NLP', 'Machine Learning', 'TensorFlow', 'Keras']


In [12]:
education = re.search(r'(B\.Sc\.|M\.Sc\.|Ph\.D\.) in [\w\s]+', cleaned_text)
print("Education:", education.group() if education else "Not found")

Education: B.Sc. in Computer Science XYZ University 2020 Skills Python NLP Machine Learning TensorFlow Keras Experience 


In [13]:
job_description = """
Looking for an AI Engineer skilled in Python, NLP, and Machine Learning.
"""

job_tokens = [lemmatizer.lemmatize(word.lower()) for word in WhitespaceTokenizer().tokenize(job_description)]
job_tokens_filtered = remove_stop_words(job_tokens)

resume_tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens_whitespace]
resume_tokens_filtered = remove_stop_words(resume_tokens)

common_skills = set(job_tokens_filtered).intersection(resume_tokens_filtered)
print("Matching Skills:", common_skills)

tokenizer = WhitespaceTokenizer()
resume_tokens = preprocess_and_filter(tokenizer.tokenize(resume_text), stop_words, lemmatizer)
job_tokens = preprocess_and_filter(tokenizer.tokenize(job_description), stop_words, lemmatizer)

Matching Skills: {'machine', 'ai', 'engineer'}


In [14]:
def analyze_resume(resume_text, job_description):
    cleaned_text = re.sub(r'\s+', ' ', resume_text)
    cleaned_text = re.sub(r'[^\w\s@.-]', '', cleaned_text)

    tokens = WhitespaceTokenizer().tokenize(cleaned_text)
    lemmatizer = WordNetLemmatizer()
    normalized_tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens]

    email = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', cleaned_text)
    phone = re.search(r'\+?\d[\d -]{8,}\d', cleaned_text)
    skills = re.findall(r'\b(Python|NLP|Machine Learning|TensorFlow|Keras)\b', cleaned_text, re.IGNORECASE)

    job_tokens = [lemmatizer.lemmatize(word.lower()) for word in WhitespaceTokenizer().tokenize(job_description)]
    matching_skills = set(normalized_tokens).intersection(job_tokens)

    return {
        "email": email.group() if email else "Not found",
        "phone": phone.group() if phone else "Not found",
        "skills": skills,
        "matching_skills": list(matching_skills),
    }

job_description = "Looking for AI Engineer skilled in Python, NLP, and Machine Learning."
result = analyze_resume(resume_text, job_description)
print("Analysis Result:", result)

Analysis Result: {'email': 'john.snow@example.com', 'phone': '1 123-456-7890', 'skills': ['Python', 'NLP', 'Machine Learning', 'TensorFlow', 'Keras'], 'matching_skills': ['for', 'in', 'ai', 'engineer', 'and', 'machine']}
