In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Sample job descriptions
job_descriptions = [
    "Looking for a data analyst skilled in Python, SQL, and Tableau with experience in cleaning and visualizing large datasets.",
    "Seeking an ML engineer with deep learning knowledge, TensorFlow, and data preprocessing experience.",
    "Hiring a business analyst familiar with Excel, stakeholder management, and BI tools like Power BI or Looker.",
    "Searching for a backend developer with Flask, Django, API development, and relational database skills."
]

# Your resume
my_resume = [
    "Experienced in Python, SQL, and Excel. Built data visualizations with Seaborn and Matplotlib. Created machine learning models with scikit-learn for business forecasting."
]


In [3]:
vectorizer = TfidfVectorizer()
all_text = my_resume + job_descriptions  # Combine resume and job posts
tfidf_matrix = vectorizer.fit_transform(all_text)


In [4]:
cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])


In [5]:
results = pd.DataFrame({
    'Job Description': job_descriptions,
    'Match Score': cos_sim[0]
})

results = results.sort_values(by='Match Score', ascending=False)
results


Unnamed: 0,Job Description,Match Score
0,"Looking for a data analyst skilled in Python, ...",0.307021
1,Seeking an ML engineer with deep learning know...,0.149729
2,"Hiring a business analyst familiar with Excel,...",0.13414
3,"Searching for a backend developer with Flask, ...",0.097891


In [6]:
# Step 1: Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 2: Job descriptions (mock examples – replace with real ones)
job_descriptions = [
    "Looking for a data analyst skilled in Python, SQL, and Tableau with experience in cleaning and visualizing large datasets.",
    "Seeking an ML engineer with deep learning knowledge, TensorFlow, and data preprocessing experience.",
    "Hiring a business analyst familiar with Excel, stakeholder management, and BI tools like Power BI or Looker.",
    "Searching for a backend developer with Flask, Django, API development, and relational database skills."
]

# Step 3: Resume (paste your actual resume text here)
my_resume = [
    "Experienced in Python, SQL, and Excel. Built data visualizations with Seaborn and Matplotlib. Created machine learning models with scikit-learn for business forecasting."
]

# Step 4: TF-IDF Vectorization
vectorizer = TfidfVectorizer()
combined_text = my_resume + job_descriptions
tfidf_matrix = vectorizer.fit_transform(combined_text)

# Step 5: Cosine similarity
cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

# Step 6: Output results
results = pd.DataFrame({
    'Job Description': job_descriptions,
    'Match Score': cos_sim[0]
}).sort_values(by='Match Score', ascending=False)

# Step 7: Display top matches
results.reset_index(drop=True, inplace=True)
results


Unnamed: 0,Job Description,Match Score
0,"Looking for a data analyst skilled in Python, ...",0.307021
1,Seeking an ML engineer with deep learning know...,0.149729
2,"Hiring a business analyst familiar with Excel,...",0.13414
3,"Searching for a backend developer with Flask, ...",0.097891


In [20]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)  # remove symbols/numbers
    tokens = text.lower().split()
    filtered = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(filtered)

# Apply cleaning
cleaned_resume = [clean_text(my_resume[0])]
cleaned_jobs = [clean_text(jd) for jd in job_descriptions]

# TF-IDF again with cleaned text
vectorizer = TfidfVectorizer()
combined_clean = cleaned_resume + cleaned_jobs
tfidf_matrix = vectorizer.fit_transform(combined_clean)

# Rerun similarity
cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
results = pd.DataFrame({
    'Job Description': job_descriptions,
    'Match Score': cos_sim[0]
}).sort_values(by='Match Score', ascending=False).reset_index(drop=True)

results



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Job Description,Match Score
0,"Looking for a data analyst skilled in Python, ...",0.149137
1,Seeking an ML engineer with deep learning know...,0.100403
2,"Hiring a business analyst familiar with Excel,...",0.094149
3,"Searching for a backend developer with Flask, ...",0.0
