In [1]:
import spacy
import glob
import os
import numpy as np
from docx import Document

In [2]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.8/400.7 MB 6.7 MB/s eta 0:01:00
     ---------------------------------------- 2.4/400.7 MB 7.1 MB/s eta 0:00:57
     ---------------------------------------- 4.2/400.7 MB 7.9 MB/s eta 0:00:51
      --------------------------------------- 6.0/400.7 MB 7.8 MB/s eta 0:00:51
      --------------------------------------- 7.6/400.7 MB 8.0 MB/s eta 0:00:50
      --------------------------------------- 9.4/400.7 MB 8.2 MB/s eta 0:00:48
     - ------------------------------------- 11.8/400.7 MB 8.6 MB/s eta 0:00:46
     - ------------------------------------- 13.6/400.7 MB 8.7 MB/s eta 0:00:45
     - ------------------------------------- 15.5/400.7 MB 8.7 MB/s eta 0:00:45
     - -----------------------


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
nlp = spacy.load("en_core_web_lg")

In [4]:
def load_docx_from_folder(folder_path, is_cv=True):
    documents = []
    filenames = []
    descriptions = []
    for filepath in glob.glob(os.path.join(folder_path, '*.docx')):
        text = extract_text_from_docx(filepath=filepath, is_cv=is_cv)
        text = text.replace('\n', ' ')
        text = text.replace('  ', ' ')
        if not is_cv:
            text = text.split("Benefits:")[0]
            description = text.split("Key Responsibilities:")[0]
        else:
            description = text.split("Project Experience")[1]
        documents.append(text)
        filenames.append(os.path.basename(filepath))
        descriptions.append(description)
    return documents, filenames, descriptions

In [5]:
def extract_text_from_docx(filepath, is_cv):
    doc = Document(filepath)
    return ' '.join([para.text for para in doc.paragraphs[1 if is_cv else 0:]])

In [6]:
# Set your folders here
cv_folder = './DataSet/cv'
job_folder = './DataSet/job_descriptions'

In [7]:
# Load documents
cv_texts, cv_files, cv_project_experiences = load_docx_from_folder(cv_folder)
job_texts, job_files, company_descriptions = load_docx_from_folder(job_folder, is_cv=False)

In [8]:
# Custom tokenizer using spaCy
def spacy_tokenizer(text):
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.text.isspace() and not token.text.isnumeric()]

In [9]:
cv_one = nlp(" ".join(spacy_tokenizer(cv_texts[0])))
job_one = nlp(" ".join(spacy_tokenizer(job_texts[0])))

In [10]:
type(job_one)

spacy.tokens.doc.Doc

In [11]:
job_one

job title senior ui ux designer company overview innovatetech solutions lead technology company dedicate create cut edge digital product enhance user experience platform team passionate innovation creativity deliver exceptional solution meet evolve need client pride foster collaborative inclusive work environment team member idea value contribute success key responsibilities lead design development user interface experience web mobile application ensure seamless intuitive user journey collaborate cross functional team include product manager developer designer translate business requirement innovative design solution conduct user research usability testing gather insight validate design concept iterate base feedback enhance user satisfaction create wireframe prototype high fidelity design industry standard design tool ensure consistency brand guideline design system mentor provide guidance junior designer foster culture continuous learning improvement design team stay update late ui ux

In [12]:
nlp(cv_one)

technical skills python tensorflow javascript reactjs aws sagemaker docker sql postgresql figma adobe xd foreign languages english c1 spanish b1 education university university politehnica bucharest program duration year master degree university politehnica bucharest program duration year certifications aws certified machine learning specialty docker certified associate tensorflow developer certificate project experience predictive analytics platform develop predictive analytic platform python tensorflow aim provide real time insight retail business leveraged aws sagemaker model training deployment ensure scalable efficient processing large dataset implement docker container streamline development deployment process enhance collaboration team technology tool python tensorflow aws sagemaker docker interactive web application data visualization create interactive web application visualize complex dataset javascript reactjs design intuitive user interface figma adobe xd focus enhance user

In [13]:
cv_one.similarity(job_one)

0.8850290179252625

In [14]:
cv_texts_preprocessed = [" ".join(spacy_tokenizer(cv_text)) for cv_text in cv_texts]
job_texts_preprocessed = [" ".join(spacy_tokenizer(job_text)) for job_text in job_texts]

In [15]:
job_text_preprocessed = nlp(job_texts_preprocessed[0])

In [16]:
job_text_preprocessed

job title senior ui ux designer company overview innovatetech solutions lead technology company dedicate create cut edge digital product enhance user experience platform team passionate innovation creativity deliver exceptional solution meet evolve need client pride foster collaborative inclusive work environment team member idea value contribute success key responsibilities lead design development user interface experience web mobile application ensure seamless intuitive user journey collaborate cross functional team include product manager developer designer translate business requirement innovative design solution conduct user research usability testing gather insight validate design concept iterate base feedback enhance user satisfaction create wireframe prototype high fidelity design industry standard design tool ensure consistency brand guideline design system mentor provide guidance junior designer foster culture continuous learning improvement design team stay update late ui ux

In [17]:
spacy_scores = [job_text_preprocessed.similarity(nlp(cv_text_preprocessed)) for cv_text_preprocessed in cv_texts_preprocessed]

In [18]:
# Get top N (e.g., top 10 semantically relevant)
top_n = 10
top_indices = np.argsort(spacy_scores)[-top_n:][::-1]
filtered_cvs = [cv_texts[i] for i in top_indices]

In [19]:
top_indices

array([265, 373, 311, 437, 200, 441, 191, 415,  81, 160])

In [20]:
similarities_embeddings = [spacy_scores[i] for i in top_indices]

In [21]:
similarities_embeddings

[0.9647905230522156,
 0.963344395160675,
 0.9630864858627319,
 0.9598843455314636,
 0.9560001492500305,
 0.9556804299354553,
 0.9551587700843811,
 0.9549747109413147,
 0.9548625946044922,
 0.9547834396362305]

In [22]:
spacy_scores[160]

0.9547834396362305

In [23]:
cv_texts[265]

'Technical Skills - JavaScript, ReactJS - HTML, CSS, Bootstrap - Figma, Adobe XD - Sketch, InVision Foreign Languages - English: C1 - Spanish: B2 Education - University Name: University of Bucharest - Program Duration: 4 years - Master Degree Name: University of Bucharest - Program Duration: 2 years Certifications - Microsoft Certified: Power Platform Fundamentals - Adobe Certified Associate (ACA): Visual Design Using Adobe XD - Certified Internet Web Professional (CIW) Site Development Associate Project Experience 1. Interactive Portfolio Website  Developed a personal portfolio website using HTML, CSS, and Bootstrap to showcase design and development projects. Implemented interactive features with JavaScript and ReactJS to enhance user engagement, such as dynamic content loading and smooth scrolling effects. Utilized Figma and Adobe XD for designing the layout and ensuring a responsive design across various devices. The project helped in improving my skills in front-end development an

In [24]:
filtered_cvs[0]

'Technical Skills - JavaScript, ReactJS - HTML, CSS, Bootstrap - Figma, Adobe XD - Sketch, InVision Foreign Languages - English: C1 - Spanish: B2 Education - University Name: University of Bucharest - Program Duration: 4 years - Master Degree Name: University of Bucharest - Program Duration: 2 years Certifications - Microsoft Certified: Power Platform Fundamentals - Adobe Certified Associate (ACA): Visual Design Using Adobe XD - Certified Internet Web Professional (CIW) Site Development Associate Project Experience 1. Interactive Portfolio Website  Developed a personal portfolio website using HTML, CSS, and Bootstrap to showcase design and development projects. Implemented interactive features with JavaScript and ReactJS to enhance user engagement, such as dynamic content loading and smooth scrolling effects. Utilized Figma and Adobe XD for designing the layout and ensuring a responsive design across various devices. The project helped in improving my skills in front-end development an

In [25]:
top_embedding_matching_cvs = [cv_texts_preprocessed[i] for i in top_indices]

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# Combine both for training
combined_cv_job_description_preprocessed = top_embedding_matching_cvs + [job_texts_preprocessed[0]]

In [28]:
# Fit vectorizer on combined
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_cv_job_description_preprocessed)

In [29]:
# Now split it back
cv_vectors = tfidf_matrix[:len(top_embedding_matching_cvs)]
job_vectors = tfidf_matrix[len(top_embedding_matching_cvs):]

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
similarities_tfidf = cosine_similarity(job_vectors, cv_vectors)

In [32]:
print(similarities_tfidf.shape)

(1, 10)


In [33]:
similarities_tfidf[0].shape

(10,)

In [34]:
np.array(similarities_tfidf).reshape(-1, 1)

array([[0.37934255],
       [0.30752267],
       [0.33582187],
       [0.21218436],
       [0.38944961],
       [0.23292916],
       [0.35446832],
       [0.2195414 ],
       [0.30486361],
       [0.3013252 ]])

In [35]:
from sklearn.preprocessing import MinMaxScaler

In [36]:
np.array(similarities_embeddings).max() - np.array(similarities_embeddings).min()

np.float64(0.010007083415985107)

In [37]:
scaler = MinMaxScaler()

In [66]:
#similarities_embeddings_scaled = scaler.fit_transform(np.array(similarities_embeddings).reshape(-1, 1)).flatten()

In [67]:
similarities_tfidf_scaled = scaler.fit_transform(np.array(similarities_tfidf).reshape(-1, 1)).flatten()

In [68]:
similarities_tfidf_scaled

array([0.94298338, 0.53782853, 0.69747178, 0.        , 1.        ,
       0.11702691, 0.80266129, 0.04150303, 0.52282809, 0.50286695])

In [71]:
np.array(similarities_embeddings).reshape(-1, 1)

array([[0.96479052],
       [0.9633444 ],
       [0.96308649],
       [0.95988435],
       [0.95600015],
       [0.95568043],
       [0.95515877],
       [0.95497471],
       [0.95486259],
       [0.95478344]])

In [41]:
final_scores = 0.6 * similarities_embeddings_scaled + 0.4 * similarities_tfidf_scaled

In [42]:
final_scores

array([0.97719335, 0.72842516, 0.77681885, 0.30583772, 0.4729509 ,
       0.10059209, 0.3435684 , 0.02806937, 0.21387717, 0.20114678])

In [43]:
top_indices

array([265, 373, 311, 437, 200, 441, 191, 415,  81, 160])

In [44]:
final_scores_indices = np.argsort(final_scores)[::-1][:5]

In [45]:
final_scores_indices

array([0, 2, 1, 4, 6])

In [46]:
final_scores_indices

array([0, 2, 1, 4, 6])

In [47]:
for i in final_scores_indices:
    print(f"{top_indices[i]} - {final_scores[i]}")

265 - 0.9771933509787771
311 - 0.7768188525911273
373 - 0.7284251566083348
200 - 0.47295090266899265
191 - 0.34356840130686733


In [48]:
cv_texts[265]

'Technical Skills - JavaScript, ReactJS - HTML, CSS, Bootstrap - Figma, Adobe XD - Sketch, InVision Foreign Languages - English: C1 - Spanish: B2 Education - University Name: University of Bucharest - Program Duration: 4 years - Master Degree Name: University of Bucharest - Program Duration: 2 years Certifications - Microsoft Certified: Power Platform Fundamentals - Adobe Certified Associate (ACA): Visual Design Using Adobe XD - Certified Internet Web Professional (CIW) Site Development Associate Project Experience 1. Interactive Portfolio Website  Developed a personal portfolio website using HTML, CSS, and Bootstrap to showcase design and development projects. Implemented interactive features with JavaScript and ReactJS to enhance user engagement, such as dynamic content loading and smooth scrolling effects. Utilized Figma and Adobe XD for designing the layout and ensuring a responsive design across various devices. The project helped in improving my skills in front-end development an

In [49]:
job_texts[0]

"Job Title: Senior UI/UX Designer Company Overview: InnovateTech Solutions is a leading technology company dedicated to creating cutting-edge digital products that enhance user experiences across various platforms. Our team is passionate about innovation, creativity, and delivering exceptional solutions that meet the evolving needs of our clients. We pride ourselves on fostering a collaborative and inclusive work environment where every team member's ideas are valued and contribute to our success. Key Responsibilities: - Lead the design and development of user interfaces and experiences for web and mobile applications, ensuring a seamless and intuitive user journey. - Collaborate with cross-functional teams, including product managers, developers, and other designers, to translate business requirements into innovative design solutions. - Conduct user research and usability testing to gather insights and validate design concepts, iterating based on feedback to enhance user satisfaction.

In [50]:
domain_examples = {
    "Banking": "Developed or tested software for financial systems, online banking platforms, payment gateways, or trading applications.",
    "Healthcare": "Built or maintained applications for hospitals, electronic health records (EHR), medical imaging, or telemedicine platforms.",
    "E-commerce": "Worked on online shopping platforms, shopping carts, recommendation engines, or payment integrations.",
    "Education": "Built learning management systems (LMS), e-learning platforms, virtual classrooms, or student data portals.",
    "Telecommunications": "Developed tools for mobile networks, VoIP systems, call centers, or network monitoring.",
    "Retail": "Created point-of-sale (POS) software, inventory systems, or customer loyalty platforms for retail chains.",
    "Insurance": "Built claims processing systems, policy management tools, or customer self-service insurance portals.",
    "Manufacturing": "Worked on production line automation, MES systems, or supply chain tracking applications.",
    "Government": "Contributed to e-government portals, public service automation, or digital identity platforms.",
    "Transportation & Logistics": "Built fleet management, delivery tracking, or logistics optimization systems.",
    "Energy & Utilities": "Developed SCADA systems, energy consumption dashboards, or smart grid monitoring tools.",
    "Legal": "Built document management, case tracking, or legal research platforms for law firms or legal departments.",
    "Real Estate": "Created property listing platforms, real estate CRMs, or mortgage application systems.",
    "Media & Entertainment": "Developed streaming platforms, content management systems, or digital publishing tools.",
    "Finance (non-banking)": "Built accounting software, budgeting tools, or investment tracking platforms."
}

In [51]:
domains = [
    "Banking",
    "Healthcare",
    "E-commerce",
    "Telecommunications",
    "Education",
    "Retail",
    "Insurance",
    "Legal",
    "Manufacturing",
    "Transportation & Logistics",
    "Energy & Utilities",
    "Real Estate",
    "Government",
    "Transportation",
    "Marketing",
    "Media & Entertainment",
    "Construction",
    "Finance (non-banking)"
]

In [52]:
from sentence_transformers import SentenceTransformer, util

In [53]:
model = SentenceTransformer("all-MiniLM-L6-v2")  # fast & accurate

In [54]:
job_description = """We are seeking a full-stack engineer to build scalable software tools for healthcare diagnostics using cloud-based machine learning pipelines."""

In [55]:
domain_data = { 
    "Banking": { "desc": "Developed software for banking platforms, digital wallets, loan processing, or credit systems.", 
                 "keywords": ["bank", "loan", "credit", "atm", "fintech", "interest", "account", "ledger"] }, 
    "Healthcare": { "desc": "Built medical systems such as EHR, hospital platforms, clinical apps, or telemedicine services.", 
                    "keywords": ["healthcare", "ehr", "patient", "hospital", "clinic", "medical", "doctor", "nurse"] }, 
    "E-commerce": { "desc": "Created platforms or tools for online stores, shopping carts, payments, or product discovery.", 
                    "keywords": ["ecommerce", "checkout", "cart", "payment", "shopify", "woocommerce", "product", "sku"] }, 
    "Telecommunications": { "desc": "Engineered tools for telecom networks, call management, VoIP, or network monitoring.", 
                            "keywords": ["telecom", "sms", "voip", "5g", "network", "bandwidth", "subscriber", "lte"] }, 
    "Education": { "desc": "Built education platforms such as learning portals, student dashboards, or LMS systems.", 
                   "keywords": ["education", "student", "teacher", "learning", "course", "classroom", "lms", "school"] }, 
    "Retail": { "desc": "Developed software for retail businesses such as POS systems, inventory, or loyalty programs.", 
                "keywords": ["retail", "store", "inventory", "pos", "stock", "sku", "receipt", "shopping"] }, 
    "Insurance": { "desc": "Created applications for policy management, claims processing, or underwriting systems.", 
                   "keywords": ["insurance", "claim", "policy", "underwriting", "premium", "broker", "risk"] }, 
    "Legal": { "desc": "Built tools for legal case management, document processing, or e-discovery platforms.", 
               "keywords": ["legal", "law", "contract", "case", "compliance", "jurisdiction", "litigation"] }, 
    "Manufacturing": { "desc": "Developed automation systems, supply chain tools, or MES solutions for production plants.", 
                       "keywords": ["manufacturing", "plant", "automation", "mes", "machine", "factory", "assembly"] }, 
    "Transportation & Logistics": { "desc": "Built platforms for delivery tracking, fleet management, logistics optimization, or routing.", 
                                    "keywords": ["logistics", "delivery", "routing", "fleet", "dispatch", "transport", "warehouse"] }, 
    "Energy & Utilities": { "desc": "Developed monitoring systems, SCADA platforms, or analytics for power and water utilities.", 
                            "keywords": ["energy", "power", "electricity", "gas", "grid", "meter", "solar", "utility", "scada"] }, 
    "Real Estate": { "desc": "Engineered platforms for property listings, CRM tools for agents, or real estate analytics.", 
                     "keywords": ["real estate", "property", "mortgage", "agent", "listing", "tenant", "lease"] }, 
    "Government": { "desc": "Built public service portals, civic data dashboards, or digital identity platforms.", 
                    "keywords": ["government", "municipal", "civic", "permit", "id", "citizen", "registry"] }, 
    "Marketing": { "desc": "Built marketing analytics tools, email campaign platforms, or digital ad performance systems.", 
                   "keywords": ["marketing", "campaign", "seo", "email", "ads", "promotion", "branding", "targeting"] }, 
    "Media & Entertainment": { "desc": "Created digital content platforms, streaming services, or entertainment production tools.", 
                               "keywords": ["media", "streaming", "video", "music", "entertainment", "broadcast", "subscriber"] }, 
    "Construction": { "desc": "Engineered project management tools, BIM integrations, or field apps for construction teams.", 
                      "keywords": ["construction", "site", "blueprint", "project", "bim", "architect", "contractor"] }, 
    "Finance (non-banking)": { "desc": "Worked on accounting systems, budgeting tools, payroll, or financial planning platforms.", 
                               "keywords": ["finance", "accounting", "budget", "payroll", "invoice", "expense", "audit", "report"] }
}

In [56]:
domain_names = list(domain_data.keys())
domain_embeddings = model.encode([domain_data[d]["desc"] for d in domain_names], convert_to_tensor=True)

In [57]:
domain_embeddings.shape

torch.Size([17, 384])

In [58]:
def match_domains(text, top_k=5, keyword_boost=0.1): 
    text_lower = text.lower() 
    text_embedding = model.encode(text, convert_to_tensor=True)
    # Step 1: Semantic similarity
    cos_scores = util.cos_sim(text_embedding, domain_embeddings)[0]
    
    # Step 2: Count keyword matches
    scores = []
    for i, domain in enumerate(domain_names):
        key_matches = sum(1 for kw in domain_data[domain]["keywords"] if kw in text_lower)
        score = float(cos_scores[i]) + key_matches * keyword_boost
        if score > 0.6:
            scores.append((domain, 1))
        elif score >= 0.5:
            scores.append((domain, score))
    
    # Step 3: Return top_k
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

In [59]:
text = """ Predictive Analytics Platform: Developed a predictive analytics platform using Python and TensorFlow, aimed at providing real-time insights for retail businesses. Used AWS SageMaker and Docker. """

In [60]:
job_description = company_descriptions[0]

In [61]:
job_description = cv_project_experiences[1]

In [62]:
job_description

' 1. Inventory Management System  Developed a robust inventory management system using Java and Spring Boot, designed to streamline warehouse operations and improve stock tracking efficiency. Implemented RESTful APIs to facilitate seamless integration with third-party logistics providers and utilized PostgreSQL for reliable data storage and retrieval. Deployed the application on AWS using Docker containers, ensuring scalability and high availability. Technologies and tools used: Java, Spring Boot, PostgreSQL, AWS, Docker. 2. Real-time Chat Application  Built a real-time chat application using ReactJS and TypeScript, providing users with a responsive and interactive messaging experience. Integrated WebSocket technology to enable instant message delivery and updates. Leveraged AWS services for backend support and data storage, ensuring secure and reliable communication. Technologies and tools used: ReactJS, TypeScript, WebSocket, AWS.'

In [63]:
results = match_domains(job_description)

In [64]:
for domain, score in results:
    print(f"{domain}: {score:.3f}")

Retail: 1.000
Transportation & Logistics: 1.000


In [65]:
results.empty()

AttributeError: 'list' object has no attribute 'empty'

In [None]:
cv_domain_experience_scores = {}

In [None]:
for index in range(len(cv_files)):
    results = match_domains(cv_project_experiences[index])
    if len(results) != 0:
        cv_domain_experience_scores[cv_files[index]] = {}
        for domain, score in results:
            cv_domain_experience_scores[cv_files[index]][domain] = score

In [None]:
cv_domain_experience_scores

In [None]:
import json

In [None]:
with open("cv_domain_experience_scores.json", "w") as f:
    json.dump(cv_domain_experience_scores, f, indent=2)

In [None]:
with open("cv_domain_experience_scores.json", "r") as f:
    cv_loaded_dict = json.load(f)

In [None]:
cv_loaded_dict

In [None]:
job_domain_experience_scores = {}

In [None]:
for index in range(len(job_files)):
    results = match_domains(company_descriptions[index])
    if len(results) != 0:
        job_domain_experience_scores[job_files[index]] = {}
        for domain, score in results:
            job_domain_experience_scores[job_files[index]][domain] = score

In [None]:
job_domain_experience_scores['job_description_17_Backend Developer.docx']

In [None]:
company_descriptions[17]

In [79]:
import re

In [80]:
custom_skills = [("Tensorflow", 60), ("AWS", 20), ("ML", 20)]

In [81]:
def get_cv_keyword_matching_scores(custom_skills, cv_texts):
    scores = [get_keyword_matching_scores(custom_skills, cv_text) for cv_text in cv_texts]
    return np.array(scores)

In [104]:
def get_keyword_matching_scores(custom_skills, cv_text):
    # Compute weighted match
    skill_matches = {}
    for skill, weight in custom_skills:
        count = len(re.findall(rf"\b{re.escape(skill.lower())}\b", cv_text.lower()))
        skill_matches[skill] = (count > 0, weight)
    # Score computation
    total_score = sum(weight for matched, weight in skill_matches.values() if matched)
    return total_score

In [105]:
get_keyword_matching_scores(custom_skills, cv_texts[321])

80

In [101]:
re.findall(rf"\bTensorFlow\b", cv_texts[321])

['TensorFlow', 'TensorFlow', 'TensorFlow', 'TensorFlow']

In [77]:
cv_texts[321]

"Technical Skills Python, TensorFlow: 4 JavaScript, ReactJS: 3 AWS SageMaker, Docker: 2 SQL, PostgreSQL: 3 Figma, Adobe XD: 2 Foreign Languages - English: C1 - Spanish: B2 - French: A2 Education - University Name: University Politehnica of Bucharest - Program Duration: 4 years In this program, students acquire skills in Python, TensorFlow, JavaScript, ReactJS, AWS SageMaker, Docker, SQL, PostgreSQL, Figma, and Adobe XD. (Note: The mean of the technical skills is not high enough to warrant a master's degree suggestion.) Certifications - AWS Certified Cloud Practitioner - Microsoft Certified: Azure AI Fundamentals Project Experience 1. Machine Learning Model for Predictive Analysis  Developed a machine learning model using Python and TensorFlow to predict student performance based on historical data. The project involved data preprocessing, feature selection, and model training, achieving an accuracy of 85%. Deployed the model on AWS SageMaker, leveraging its capabilities to streamline t