In [1]:
import fitz

def pdf_to_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text
pdf_path = "Tenzin Norzin Resume.pdf"#add resume in pdf format
text = pdf_to_text(pdf_path)

In [5]:
#preprocessing
import re
import spacy
    
def preprocess_resume(text):
        #removing contacts info
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",'', text)
    text = re.sub(r"\+?\d{1,3}[-.\s]?\(?\d{2,3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",'', text)
        
        #removing URLs
    text = re.sub(r'http\S+\s*', '', text)
        
        #removes common Twitter related terms
    text = re.sub('RT|cc', ' ', text)
        
        #removing hashtags
    text = re.sub(r'#\S+', '', text)
        
        #removing mentions
    text = re.sub(r'@\S+', ' ', text)
        
        #removing punctuations
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_{|}~"""), ' ', text)
        
        #removing Non-ASCII Characters
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
        
        #remove social media mentions
    text = re.sub(r'\b(linkedin|github|coursera|kaggle|hackerank|location)\b', '', text, flags=re.IGNORECASE)
        
    month_pattern = r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December|month|year)s?\b"
    text = re.sub(month_pattern, '', text, flags=re.IGNORECASE)
        
        #removing extra whitespace
    text = re.sub(r'\s+', ' ', text)
        
    nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
    text = text
    doc = nlp(text) 
        
        #removing name and location
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            names = ent.text.split()
            pattern = r'\b(?:' + '|'.join(map(re.escape, names)) + r')\b'
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)
                
        elif ent.label_ in {"LOC", "GPE", "ORG"}:
            loca = ent.text.split()
            pattern_loc = r'\b(?:' +'|'.join(map(re.escape, loca)) + r')\b'
            text = re.sub(pattern_loc, '', text, flags=re.IGNORECASE)
            
        # Remove dates in various formats (MM/DD/YYYY, YYYY-MM-DD, etc.)
    date_pattern = r"\b(?:\d{1,2}[-/\.]?\d{1,2}[-/\.]?\d{2,4}|\d{4}[-/\.]?\d{1,2}[-/\.]?\d{1,2}|\d{1,2}\s+[A-Za-z]+\s+\d{4})\b"
    text = re.sub(date_pattern, '', text)
        
    text = re.sub(r'\d+', '', text)
        #text = re.sub(r'\b\w\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
        
        #label encoder
        #tokenization & stop word removal
    doc = nlp(text)
    text = [token.text for token in doc if not token.is_stop]
    text = ' '.join(text)
        
    return text
    
    '''text = "Atomcode is Hiring C Data Analysts c (Work From Home) C++ \n We are hiring talented and enthusiastic Data Analysts for an immediate joiner position. This is a work-from-home opportunity!\n Requirements:\n Proficiency in Excel, SQL, and data visualization tools like Power BI or Tableau.\n Strong analytical and problem-solving skills.\n Knowledge of data cleaning, preprocessing, and statistical analysis.\n Familiarity with Python or R for data analysis is a plus.\n Join us and make an impact with your skills!"
    a = text.split()
    print(len(a))'''
cleaned_text = preprocess_resume(text)
print(cleaned_text)
print(len(cleaned_text))

TENZIN EDUCATION B Tech Hons Computer Pandas Supervised Unsupervised Deep Mathematical Foundations Statistics Probability Algebra Calculus Matrices Database Management MySQL NoSQL MongoDB Programming Languages C R CE IFICATIONS DSA LeetCode Exercises Intermediate course SkillsBuild openHPI PROJECTS Psychbot Intern Initiated blogs informative posts developed strategies products Managed inventory items including conducting market Bug Clasher Shoot Bug Quiz Craze Developing driven job portal disabled students leveraging deep enhance essibility personalized job matching course Designed based recommendation system utilizes distinct filters MRI scans assess predict stages Alzheimer s disease aiming facilitate earlier intervention enhance patient outcomes Developed theoretical offline bus ticket payment system enable digital transactions areas limited internet connectivity DSA LeetCode Exercises techniques analyze user provided predicting mental health outcomes providing advice suggestions ap



In [7]:
#Implementing Cosine Similarity in Python (TF-IDF Approach)
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

file_path = r"C:\Users\HP\OneDrive\Documents\Projects\final year project\job description\job description dataset.xlsx"
df = pd.read_excel(file_path)

df = df.fillna("")

job_descriptions = df["job_description"].to_list()
company_names = df["company name"].to_list()
job_titles = df["job_title"].to_list()

resume = cleaned_text

model = SentenceTransformer('all-MiniLM-L6-v2')

job_embeddings = model.encode(job_descriptions)
resume_embedding = model.encode(resume)

similarity_scores = cosine_similarity([resume_embedding], job_embeddings)[0]

ranked_jobs = sorted(enumerate(similarity_scores), key=lambda x: x[1], reverse=True)

for rank, (index, score) in enumerate(ranked_jobs, start=1):
    print(f"{rank}. {job_titles[index]} at {company_names[index]}->match:{score:.3f}" )

1. Academic Counsellor at EdiGlobe Online Services Pvt Ltd->match:0.623
2. Business Development Associate at Intellipaat (E- Learning)->match:0.595
3. Program Architect at Kalvium (Engineer)->match:0.587
4. Business Development Associate at KnoDTec Solutions->match:0.577
5. Software engineer at Prodapt->match:0.574
6. Business Development Intern at Terratern->match:0.544
7. data analyst at Atom Code->match:0.526
8. Solar Consultant at SolarSquare Energy Pvt Ltd->match:0.515
9. Lead Generation Executive at Skill Intern Pvt Ltd->match:0.475
10. Graduate Trainee Engineer at Adtran->match:0.452
11.  at ->match:0.036
12.  at ->match:0.036
13.  at ->match:0.036
14.  at ->match:0.036
