In [1]:
import os
import cv2
import re
import pickle
from glob import glob
from tqdm import tqdm
from dotenv import load_dotenv
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

import pytesseract
from pdf2image import convert_from_path
from pypdf import PdfReader
from sklearn.metrics.pairwise import cosine_similarity
from langchain_google_vertexai import VertexAIEmbeddings

%load_ext autoreload
%autoreload 2

from src.utils.config import settings

load_dotenv()

True

In [2]:
class FileReader:
    def __init__(self, jd_path, resume_path):
        self.jds_path = jd_path
        self.resumes_path = resume_path

        self.jd_data = defaultdict(str)
        self.resume_data = defaultdict(str)

    def read_jd_data(self):
        files = glob(self.jds_path, recursive=True)
        for file in files:
            with open(file, 'rb') as f:
                data = f.read().strip().lower()
                data = b'{data}'.decode("utf-8")
                job_label = file.split('/')[-1].replace('.txt', '')
                self.jd_data[job_label] = data
        return self.jd_data

    def read_resume_data(self):
        files = glob(self.resumes_path, recursive=True)
        for file in files:
            reader = PdfReader(file)
            data = []
            for page in reader.pages:
                data.append(page.extract_text())
            data = '\n'.join(data).strip().lower()
            job_title = file.split('/')[-2].replace(' ', '_').lower()
            resume_name = file.split('/')[-1].replace('-', '_').lower().replace('.pdf', '')

            if len(data) > 1:
                self.resume_data[f'{job_title}_{resume_name}'] = data
            else:
                pages = convert_from_path(file)
                extracted_text = []
                for page in pages:
                    preprocessed_img = self.deskew(np.array(page))
                    text = self.extract_text_from_img(preprocessed_img)
                    extracted_text.append(text)
                self.resume_data[f'{job_title}_{resume_name}'] = '\n'.join(extracted_text).strip().lower()

        return self.resume_data

    def deskew(self, img):
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.bitwise_not(gray)
        coords = np.column_stack(np.where(gray > 0))
        angle = cv2.minAreaRect(coords)[-1]

        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle

        (h, w) = img.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return rotated

    def extract_text_from_img(self, img):
        return  pytesseract.image_to_string(img)

In [3]:
file_reader = FileReader(settings.JD_PATH, settings.RESUME_PATH)
jd_data = file_reader.read_jd_data()
resume_data = file_reader.read_resume_data()

incorrect startxref pointer(3)
parsing for Object Streams
incorrect startxref pointer(3)
parsing for Object Streams
incorrect startxref pointer(3)
parsing for Object Streams
incorrect startxref pointer(3)
parsing for Object Streams
incorrect startxref pointer(3)
parsing for Object Streams
incorrect startxref pointer(3)
parsing for Object Streams
incorrect startxref pointer(3)
parsing for Object Streams
incorrect startxref pointer(3)
parsing for Object Streams


In [4]:
print(f'Total JDs: {len(jd_data)}')
print(f'Total RESUMES: {len(resume_data)}')

Total JDs: 32
Total RESUMES: 88


In [5]:
class EmbeddingModel:
    def __init__(self):
        self.embedding_model = VertexAIEmbeddings(settings.EMBEDDING_MODEL_NAME)

    def get_embedding_model(self, data: dict):
        output = {}
        keys = list(data.keys())
        docs = list(data.values())
        embeddings = self.embedding_model.embed_documents(docs)
        for i in range(len(keys)):
            output[keys[i]] = embeddings[i]
        return output

    def save_embedding(self, embedding, file_name):
        with open(settings.OUTPUT_PATH + file_name, 'wb') as f:
            pickle.dump(embedding, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_embedding(self, file_name):
        with open(settings.OUTPUT_PATH + file_name, 'rb') as f:
            embedding = pickle.load(f)
        return embedding

In [6]:
embedding_model = EmbeddingModel()
jd_embedding = embedding_model.get_embedding_model(jd_data)
embedding_model.save_embedding(jd_embedding, settings.JD_EMBEDDING_FILE_NAME)
resume_embedding = embedding_model.get_embedding_model(resume_data)
embedding_model.save_embedding(resume_embedding, settings.RESUME_EMBEDDING_FILE_NAME)

### Load Embeddings

In [7]:
jd_embedding = embedding_model.load_embedding(settings.JD_EMBEDDING_FILE_NAME)
resume_embedding = embedding_model.load_embedding(settings.RESUME_EMBEDDING_FILE_NAME)

### Calculating Cosine Similarity

In [8]:
resume_jd_match = {
    'data_engineer': 'de', 'data_analyst': 'ddataanalyst', 'big_data_analyst': 'bigdataanalyst', 'mlops_engineer': 'mlops',
    'data_scientist': 'ds', 'data_architect': 'da', 'machine_learning_engineer': 'mle', 'business_intelligence_analyst':'bianalyst'
}
jd_pattern = re.compile(r'\d+_[a-z]+$')
resume_pattern = re.compile(r'_resume_\d+$')

In [9]:
SIMILARITY_DICT = {}
for jd_name in jd_embedding.keys():
    for resume_name in resume_embedding.keys():
        cleaned_jd_name = jd_pattern.sub('', jd_name)
        cleaned_resume_name = resume_pattern.sub('', resume_name)
        print(cleaned_jd_name, cleaned_resume_name)
        if resume_jd_match[cleaned_resume_name] == cleaned_jd_name:
            similarity_score = cosine_similarity(
                np.array(jd_embedding[jd_name]).reshape(1, -1),
                np.array(resume_embedding[resume_name]).reshape(1, -1))[0][0]
            if resume_name not in SIMILARITY_DICT:
                SIMILARITY_DICT[resume_name] = {}
            SIMILARITY_DICT[resume_name][jd_name] = {'score': similarity_score}
SIMILARITY_DICT.keys()

bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst data_architect
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst machine_learning_engineer
bigdataanalyst mlops_engineer
bigdataanalyst mlops_engineer
bigdataanalyst mlops_engineer
bigdataanalyst mlops_engineer
bigdataanalyst mlops_engineer
bigdataanalyst data_engineer
bigdataanalyst data_engineer
bigdataanal

dict_keys(['big_data_analyst_resume_1', 'big_data_analyst_resume_2', 'big_data_analyst_resume_3', 'machine_learning_engineer_resume_73', 'machine_learning_engineer_resume_77', 'machine_learning_engineer_resume_76', 'machine_learning_engineer_resume_74', 'machine_learning_engineer_resume_75', 'machine_learning_engineer_resume_78', 'machine_learning_engineer_resume_79', 'machine_learning_engineer_resume_81', 'machine_learning_engineer_resume_80', 'machine_learning_engineer_resume_82', 'machine_learning_engineer_resume_83', 'mlops_engineer_resume_88', 'mlops_engineer_resume_87', 'mlops_engineer_resume_86', 'mlops_engineer_resume_84', 'mlops_engineer_resume_85', 'business_intelligence_analyst_resume_12', 'business_intelligence_analyst_resume_13', 'business_intelligence_analyst_resume_11', 'business_intelligence_analyst_resume_10', 'business_intelligence_analyst_resume_8', 'business_intelligence_analyst_resume_9', 'business_intelligence_analyst_resume_7', 'business_intelligence_analyst_resu

In [10]:
def get_top_matching_jobs(resume_name):
    score_list = [[key, SIMILARITY_DICT[resume_name][key]['score']] for key in SIMILARITY_DICT[resume_name].keys()]
    score_list.sort(key=lambda x: x[1], reverse=True)
    return score_list[0]

In [11]:
get_top_matching_jobs('big_data_analyst_resume_1')

['bigdataanalyst2_maveric', np.float64(0.7090714213045981)]