1. PDF Data Extraction

In [84]:
from  PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_file):
    """
    Read a PDF and return the text of all of its pages as a single string.

    Args:
        pdf_file (str): Path to the PDF file to read.

    Returns:
        str: The text of every page, concatenated in page order with no
            separator inserted between pages.
    """
    reader = PdfReader(pdf_file)

    # Collect the text of each page in order, then glue the pieces together.
    page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

In [85]:
# Try the extractor on a single PDF (path relative to the working directory)
# and print the raw text it returns.
pdf = "10985403.pdf"
print(extract_text_from_pdf(pdf))

MECHANICAL ENGINEERING INTERN
Summary
CAD | CAM | Finite Element Analysis | Mechanical Design | Product Design and Development
Skills
5 years of experience with CAD packages (SolidWorks, Autodesk Inventor, AutoCAD, CATIA, PTC CREO)
2.5 years of experience with CAE Softwares (HyperMesh, Abaqus, ANSYS, Optistruct)
2.5 years of experience with Analysis (Linear & Non-linear Static, Dynamic, GD & T, Tolerance Analysis, Design Optimization)
Experience with Sheet metal, Design for manufacturing, generating Bill of Materials, DFMEA, Sculpting.
Experience with advanced material selection for rapid prototyping, advanced manufacturing, welding and 3D printing.
Experience
09/2013
 
to 
05/2014
Company Name
Finite Element Analysis of Industrial Robotic Assembly, Illinois Institute of Technology, Chicago 
Jan - May 2016.
Conceptualized, brainstormed and designed a 6-axis SCARA Robot for pick and place operation in automotive industry.
Performed static analysis with stainless steel 304 to evaluate th

2. Job Description Data Understanding

In [86]:
import pandas as pd

# Load the job-description dataset from a CSV file in the working directory.
df = pd.read_csv('training_data.csv')
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."


In [87]:
# Drop the description_length and model_response columns, then keep the first 20 rows.
output = df.drop(columns=['description_length', "model_response"]).head(20)

In [88]:
import os 

def get_cv_texts(resume_directory="test"):
    """
    Extract the text of every PDF found directly inside a directory.

    Args:
        resume_directory (str): Directory to scan for PDF files. Only its
            immediate entries are examined; sub-directories are not walked.

    Returns:
        pandas.DataFrame: One row per PDF with the columns ``name`` (the
            directory joined with the file name) and ``extracted_text``
            (the result of ``extract_text_from_pdf`` for that path). Rows are
            ordered by file name. Both columns are present even when no PDF
            is found.
    """
    columns = ["name", "extracted_text"]
    records = []

    # os.listdir returns entries in arbitrary order; sort so that row
    # positions are reproducible between runs and machines.
    for filename in sorted(os.listdir(resume_directory)):
        # Compare case-insensitively so files such as "CV.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_file_path = os.path.join(resume_directory, filename)
        records.append({
            "name": pdf_file_path,
            "extracted_text": extract_text_from_pdf(pdf_file_path),
        })

    # Passing the columns explicitly keeps the schema stable for a directory
    # without PDFs, where DataFrame([]) alone would have no columns at all.
    return pd.DataFrame(records, columns=columns)


In [89]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """
    Average token embeddings over dim 1, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (assumed layout: batch, tokens, hidden — TODO confirm).
        attention_mask: Tensor that is given a trailing dimension and expanded
            to the shape of the token embeddings; positions with value 0
            contribute nothing to the average.

    Returns:
        torch.Tensor: The mask-weighted sum over dim 1 divided by the mask
            total, with the divisor clamped to at least 1e-9 so that a fully
            masked row does not divide by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension and match dtypes.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / mask_total



# Inputs for matching: the job-description column of `output` and a frame of
# CV texts read from the default directory of get_cv_texts().
job_descriptions = output['job_description']
cvs = get_cv_texts()

# Load model from HuggingFace Hub (tokenizer and model use the same checkpoint name)
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenize sentences
# Each text is tokenized in its own call, producing one encoding per text.
# NOTE(review): with a single text per call padding=True presumably has no
# effect, and truncation=True presumably cuts long texts to the tokenizer's
# maximum length — confirm that losing the tail of long CVs is acceptable.
encoded_job_descriptions = [tokenizer(des, padding=True, truncation=True, return_tensors='pt') for des in job_descriptions]
encoded_cvs = [tokenizer(cv, padding=True, truncation=True, return_tensors='pt') for cv in cvs["extracted_text"]]


# Compute token embeddings
# Gradients are disabled because the model is only used for inference here.
# The model is run once per encoding, so each list holds one output per text.
with torch.no_grad():
    embedded_job_description = [model(**en_input) for en_input in encoded_job_descriptions]
    embedded_cvs = [model(**en_input) for en_input in encoded_cvs]


In [92]:
# Pool each model output into one vector per text using that text's attention
# mask, then scale the pooled vector to unit L2 norm along dim 1.
normalized_job_descriptions = [
    F.normalize(mean_pooling(model_out, encoding['attention_mask']), p=2, dim=1)
    for model_out, encoding in zip(embedded_job_description, encoded_job_descriptions)
]

normalized_cvs = [
    F.normalize(mean_pooling(model_out, encoding['attention_mask']), p=2, dim=1)
    for model_out, encoding in zip(embedded_cvs, encoded_cvs)
]


In [93]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of best-matching CVs kept for each job description.
TOP_K = 5

# For every job description, score each CV and keep the TOP_K closest ones.
# Each entry of `similarity` is a list of (cv_path, score) tuples, where the
# score is the cosine similarity multiplied by 100, ordered best match first.
similarity = []
for em in normalized_job_descriptions:
    # Pair each CV path with its embedding positionally; cvs["name"] and
    # normalized_cvs are both derived from the rows of `cvs` in order.
    ls = [
        (cv_name, cosine_similarity(em, cv_embedding)[0][0]*100)
        for cv_name, cv_embedding in zip(cvs["name"], normalized_cvs)
    ]
    # A single sort after all scores are collected is enough to rank them.
    ls.sort(key=lambda x: x[1], reverse=True)

    similarity.append(ls[:TOP_K])


In [102]:
# Attach the ranked matches as a new column (this modifies `output` in place).
# `similarity` holds one list per element of normalized_job_descriptions —
# presumably one per row of `output`; confirm the lengths match before assigning.
output["similarity"] = similarity
# Show each position title next to its list of (cv path, score) matches.
output[["position_title", "similarity"]]

Unnamed: 0,position_title,similarity
0,Sales Specialist,"[(test\22776912.pdf, 61.507463455200195), (tes..."
1,Apple Solutions Consultant,"[(test\15119529.pdf, 47.96050190925598), (test..."
2,Licensing Coordinator - Consumer Products,"[(test\15119529.pdf, 56.1706006526947), (test\..."
3,Web Designer,"[(test\29147100.pdf, 57.078760862350464), (tes..."
4,Web Developer,"[(test\20001721.pdf, 68.19979548454285), (test..."
5,Frontend Web Developer,"[(test\29147100.pdf, 49.1132378578186), (test\..."
6,Remote Website Designer,"[(test\24610685.pdf, 66.08400344848633), (test..."
7,Web Designer,"[(test\27497542.pdf, 63.083481788635254), (tes..."
8,Web Designer,"[(test\24610685.pdf, 59.95955467224121), (test..."
9,SR. Web Designer,"[(test\11020140.pdf, 60.03981828689575), (test..."
