In [None]:
# Links to the datasets used in this notebook:
# https://www.kaggle.com/datasets/jillanisofttech/updated-resume-dataset
# https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset
# https://www.kaggle.com/datasets/arshkon/linkedin-job-postings

# I combined the resume-dataset and updated-resume-dataset into resumes.csv.

In [2]:
%load_ext memory_profiler
%memit
import os
import sys
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow as tf
import logging
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import ssl
import nltk
import hashlib
import json
import time
from openai import OpenAI

# Read the API key from the environment instead of embedding it in the notebook.
# A key that has been saved in a notebook or committed to version control must be
# treated as leaked and rotated. Raises KeyError immediately if OPENAI_API_KEY is
# not set, so a missing credential is reported before any API call is attempted.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
peak memory: 484.30 MiB, increment: 0.53 MiB


In [None]:
%%memit
# Read both CSV files and keep only the first three rows of each frame.
postings = pd.read_csv("data/trunc-postings.csv").head(3)
resumes = pd.read_csv("data/trunc-resumes.csv").head(3)

In [None]:
%%memit

# Fetch the NLTK resources used by preprocess_text (stop-word list and WordNet).
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

def preprocess_text(text_series):
    """Normalise a pandas Series of free text and return the cleaned Series.

    Steps, applied to every element:
      1. missing values are replaced with the empty string;
      2. the text is lower-cased and every character that is neither a word
         character nor whitespace is removed;
      3. the text is split on whitespace, English stop words are dropped, and
         each remaining token is lemmatized with ``WordNetLemmatizer``;
      4. the surviving tokens are re-joined with single spaces.
    """
    logging.info("Preprocessing text...")

    def strip_punctuation(raw_text):
        # Lower-case first, then delete anything that is not \w or \s.
        return re.sub(r'[^\w\s]', '', raw_text.lower())

    without_punctuation = text_series.fillna("").apply(strip_punctuation)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def lemmatize_tokens(cleaned_text):
        kept_tokens = [
            lemmatizer.lemmatize(token)
            for token in cleaned_text.split()
            if token not in stop_words
        ]
        return ' '.join(kept_tokens)

    return without_punctuation.apply(lemmatize_tokens)

# Clean the free-text column of each frame in place, then show the first rows
# of both frames so the effect of the preprocessing can be inspected.
for frame, text_column in ((resumes, 'Resume'), (postings, 'description')):
    frame[text_column] = preprocess_text(frame[text_column])

for frame in (resumes, postings):
    display(frame.head(3))

In [None]:
%%memit
# Load version 4 of the Universal Sentence Encoder from TensorFlow Hub.
# vectorize_text calls this module-level object directly.
universal_sentence_encoder = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder/4"
)

def vectorize_text(text_series):
    """Embed every element of ``text_series`` with the module-level encoder.

    The series is converted to a plain list, passed to
    ``universal_sentence_encoder`` in a single call, and the value returned by
    calling ``.numpy()`` on the encoder output is handed back to the caller.
    """
    logging.info("Vectorizing text...")
    texts = text_series.tolist()
    encoded = universal_sentence_encoder(texts)
    return encoded.numpy()

# Embed the cleaned resume and posting texts (resumes first, then postings).
vectorized_resumes, vectorized_postings = (
    vectorize_text(text_column)
    for text_column in (resumes['Resume'], postings['description'])
)

# Report the container type and raw byte size of each embedding matrix.
print(f'vectorized resumes: {type(vectorized_resumes)}, {vectorized_resumes.nbytes}')
print(f'vectorized postings: {type(vectorized_postings)}, {vectorized_postings.nbytes}')

In [None]:
%%memit
# Build every (resume embedding, posting embedding) combination.
#
# The pairs are emitted posting-major: all resumes for posting 0, then all
# resumes for posting 1, and so on. generate_labels iterates descriptions in its
# outer loop and resumes in its inner loop, so this is the order in which labels
# are produced; any other order would attach labels to the wrong pairs when the
# two lists are later split together by train_test_split.
#
# Rows of the embedding matrices are already numpy arrays, so no extra
# conversion pass is needed.
resume_posting_pairs = [
    (vectorized_resumes[resume_idx], vectorized_postings[posting_idx])
    for posting_idx in range(len(vectorized_postings))
    for resume_idx in range(len(vectorized_resumes))
]

print(f'resume_posting_pairs: {type(resume_posting_pairs)}, {len(resume_posting_pairs)}')

In [None]:
%%memit
# Label cache keyed by an MD5 digest (see generate_labels). When cache.json is
# absent, start with an empty dict and create the file; otherwise load it.
if not os.path.exists('cache.json'):
    cache = {}
    with open('cache.json', 'w') as cache_file:
        json.dump(cache, cache_file)
else:
    with open('cache.json', 'r') as cache_file:
        cache = json.load(cache_file)

def generate_labels(descriptions, resumes):
    """Label every (description, resume) combination as relevant (1) or not (0).

    Labels are produced in description-major order: for each description, in
    order, one label per resume, in order. The returned list therefore has
    ``len(descriptions) * len(resumes)`` entries.

    Each combination is identified by the MD5 digest of the resume text
    followed by the description text. If that digest is in the module-level
    ``cache`` the stored label is reused; otherwise the chat model is asked and
    the answer counts as relevant only when it contains one of a small set of
    phrases (matched case-insensitively). New labels are stored in ``cache``.

    If the API call fails, the error is printed and the label falls back to 0.
    The fallback is not cached, so a later run asks again. ``cache`` is written
    to ``cache.json`` before the function exits, even if the loop is
    interrupted, so labels already obtained are not lost.

    Args:
        descriptions: iterable of job-description texts.
        resumes: iterable of resume texts.

    Returns:
        list of int labels (0 or 1) in description-major order.
    """
    logging.info("Generating labels...")
    relevant_phrases = ('highly relevant', 'very relevant', 'extremely relevant')
    labels = []

    try:
        for description in descriptions:
            for resume in resumes:
                key = hashlib.md5(f"{resume}{description}".encode()).hexdigest()

                if key in cache:
                    labels.append(cache[key])
                    continue

                prompt = f"How relevant is this resume to the job description? {resume} {description}"

                try:
                    response = client.chat.completions.create(
                        model="gpt-3.5-turbo",
                        messages=[
                            {"role": "system", "content": "Analyze the relevance of the resume to the job description."},
                            {"role": "user", "content": prompt}
                        ]
                    )
                    # The reply may start a sentence with "Highly relevant", so
                    # compare in lower case; guard against an empty reply.
                    relevance_text = (response.choices[0].message.content or "").strip().lower()
                except Exception as e:
                    print(f'Error {e}')
                    # Keep the label list the same length as the pair list, but
                    # do not cache the fallback so the pair is retried later.
                    labels.append(0)
                    continue

                relevance = 1 if any(phrase in relevance_text for phrase in relevant_phrases) else 0
                cache[key] = relevance
                labels.append(relevance)
    finally:
        with open('cache.json', 'w') as cache_file:
            json.dump(cache, cache_file)

    return labels

labels = generate_labels(postings['description'], resumes['Resume'])
print(labels)

# ML model. Train on the GPU when a CUDA device is available; otherwise fall
# back to the CPU so the notebook still runs on machines without an NVIDIA driver.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hold out 20% of the (pair, label) examples for validation. The fixed
# random_state makes the split reproducible across runs.
train_pairs, test_pairs, train_labels, test_labels = train_test_split(resume_posting_pairs, labels, test_size=0.2, random_state=42)
logging.info("Data split into training and testing sets.")
print("Training and testing data prepared.")

# Separate each split into its two sides: element 0 of every pair goes to the
# "left" array and element 1 to the "right" array.
train_left = np.array([pair[0] for pair in train_pairs])
train_right = np.array([pair[1] for pair in train_pairs])
test_left = np.array([pair[0] for pair in test_pairs])
test_right = np.array([pair[1] for pair in test_pairs])

# float32 tensors for the network inputs, stored as (left, right) tuples.
train_data = (torch.tensor(train_left, dtype=torch.float32), torch.tensor(train_right, dtype=torch.float32))
test_data = (torch.tensor(test_left, dtype=torch.float32), torch.tensor(test_right, dtype=torch.float32))

# Labels become float column vectors (one trailing dimension added by
# unsqueeze) so they line up with the model's single-column output.
train_labels = torch.tensor(train_labels, dtype=torch.float32).unsqueeze(1)
test_labels = torch.tensor(test_labels, dtype=torch.float32).unsqueeze(1)

# Sanity report. The label lines print the actual tensor shape, matching
# their "Shape" caption, rather than only the number of rows.
print("Train Data Shape:", (train_left.shape, train_right.shape))
print("Train Data Type:", (type(train_left), type(train_right)))
print("Train Labels Shape:", tuple(train_labels.shape))
print("Train Labels Type:", type(train_labels))
print("Test Data Shape:", (test_left.shape, test_right.shape))
print("Test Data Type:", (type(test_left), type(test_right)))
print("Test Labels Shape:", tuple(test_labels.shape))
print("Test Labels Type:", type(test_labels))

class SiameseNetwork(nn.Module):
    """Two-input network that scores a pair through one shared encoder.

    Both inputs pass through the same ``shared_network`` stack
    (Linear -> ReLU -> BatchNorm1d -> Dropout, twice, followed by
    Linear -> ReLU down to 64 features). The element-wise absolute difference
    of the two encodings is mapped by ``output_layer`` to a single value and
    squashed with a sigmoid.
    """

    def __init__(self, input_size):
        """Build the shared encoder for inputs with ``input_size`` features."""
        super().__init__()

        # Two hidden stages with normalisation and dropout, widths 256 and 128.
        stages = []
        in_features = input_size
        for out_features in (256, 128):
            stages.extend([
                nn.Linear(in_features, out_features),
                nn.ReLU(),
                nn.BatchNorm1d(out_features),
                nn.Dropout(0.5),
            ])
            in_features = out_features

        # Final projection to the 64-feature encoding.
        stages.extend([nn.Linear(in_features, 64), nn.ReLU()])

        self.shared_network = nn.Sequential(*stages)
        self.output_layer = nn.Linear(64, 1)

    def forward_one_side(self, x):
        """Encode one side of a pair with the shared network."""
        return self.shared_network(x)

    def forward(self, input1, input2):
        """Return the sigmoid score for each (input1, input2) row pair."""
        encoded_first = self.forward_one_side(input1)
        encoded_second = self.forward_one_side(input2)
        gap = (encoded_first - encoded_second).abs()
        return torch.sigmoid(self.output_layer(gap))

# Width of one embedding vector, taken from the second axis of the training array.
input_size = train_left.shape[1]

# Seed PyTorch so weight initialisation, dropout masks and batch shuffling are
# repeatable from run to run (the train/test split is already seeded).
torch.manual_seed(42)

model = SiameseNetwork(input_size).to(device)
# The model already applies a sigmoid, so plain binary cross-entropy is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
batch_size = 8

train_dataset = torch.utils.data.TensorDataset(train_data[0], train_data[1], train_labels)
# BatchNorm1d raises in training mode when a batch holds a single sample. If the
# training set would leave exactly one trailing sample (and at least one full
# batch exists), drop that incomplete batch instead of crashing mid-epoch.
drop_trailing_single = len(train_dataset) > batch_size and len(train_dataset) % batch_size == 1
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, drop_last=drop_trailing_single
)
test_dataset = torch.utils.data.TensorDataset(test_data[0], test_data[1], test_labels)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Early-stopping bookkeeping: best validation loss seen so far and the number
# of consecutive epochs without improvement.
best_val_loss = float('inf')
patience_counter = 0

logging.info("Starting model training...")
print("Starting model training...")

# Number of consecutive epochs without a validation-loss improvement that are
# tolerated before training stops. NOTE(review): this name was referenced but
# never defined; 5 is a chosen default — tune as needed.
early_stopping_patience = 5

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    # The batch variable is named batch_labels so it does not overwrite the
    # module-level `labels` list, which earlier cells still depend on.
    for left, right, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(left.to(device), right.to(device))
        loss = criterion(outputs, batch_labels.to(device))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Mean loss per batch.
    train_loss /= len(train_loader)

    # ---- validation pass (no gradients, eval-mode dropout/batch-norm) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for left, right, batch_labels in test_loader:
            outputs = model(left.to(device), right.to(device))
            loss = criterion(outputs, batch_labels.to(device))
            val_loss += loss.item()

    val_loss /= len(test_loader)

    logging.info(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss}, Validation Loss: {val_loss}")
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss}, Validation Loss: {val_loss}")

    if val_loss < best_val_loss:
        # New best: checkpoint the weights and reset the patience counter.
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'siamese_network.pth')
        logging.info("Model saved.")
        print("Model saved.")
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            logging.info("Early stopping triggered.")
            print("Early stopping triggered.")
            break

logging.info("Model training completed.")
print("Model training completed.")

# Restore the best checkpoint written during training, mapped onto the active device.
model.load_state_dict(torch.load('siamese_network.pth', map_location=device))
logging.info("Best model loaded.")
print("Best model loaded.")