In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re 
import torch


from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.preprocessing import LabelEncoder

from transformers import DistilBertTokenizer, DistilBertModel

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
# def preprocess(sentence):
#     sentence=str(sentence)
#     sentence = sentence.lower()
#     sentence=sentence.replace('{html}',"")
#     cleanr = re.compile('<.*?>')
#     cleantext = re.sub(cleanr, '', sentence)
#     rem_url=re.sub(r'http\S+', '',cleantext)
#     rem_num = re.sub('[0-9]+', '', rem_url)
#     tokenizer = RegexpTokenizer(r'\w+')
#     tokens = tokenizer.tokenize(rem_num)  
#     filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
#     stem_words=[stemmer.stem(w) for w in filtered_words]
#     lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
#     return " ".join(filtered_words)

# print("--------PROCESSING TRAINING DATA--------")
# processed_df = df.copy()
# processed_df['resume_text'] = processed_df['resume_text'].map(lambda s: preprocess(s))
# processed_df['job_description_text'] = processed_df['job_description_text'].map(lambda s: preprocess(s))
# processed_df['combined_text'] = processed_df['resume_text'] + ' ' + processed_df['job_description_text']
# processed_df['label'] = le.fit_transform(processed_df['label'])

# print("--------PROCESSING TEST DATA--------")
# test_df = pd.read_csv('data/test.csv')
# processed_test_df = test_df.copy()

# processed_test_df['resume_text'] = processed_test_df['resume_text'].map(lambda s: preprocess(s))
# processed_test_df['job_description_text'] = processed_test_df['job_description_text'].map(lambda s: preprocess(s))
# processed_test_df['combined_text'] = processed_test_df['resume_text'] + ' ' + processed_test_df['job_description_text']
# processed_test_df['label'] = le.transform(processed_test_df['label'])

In [3]:
# processed_df.to_csv('data/processed_train.csv', index=False)
# processed_test_df.to_csv('data/processed_test.csv', index=False)

In [4]:
# # Making an evaluation set from the training set
# from sklearn.model_selection import train_test_split

# df_processed = pd.read_csv('data/processed_train.csv')
# train_data, eval_data = train_test_split(df_processed, test_size=0.2, random_state=42)
# train_data.to_csv('data/processed_train.csv', index=False)
# eval_data.to_csv('data/processed_eval.csv', index=False)

In [5]:
processed_df = pd.read_csv('data/processed_train.csv')
processed_eval = pd.read_csv('data/processed_eval.csv')
processed_test_df = pd.read_csv('data/processed_test.csv')

In [6]:
# # Load a pretrained model (BERT in this case) and its tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [7]:
class SentencePairDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512):
        """
        df: Pandas DataFrame with columns 'resume_text', 'job_description_text', and 'label'
        """
        self.sentences_a = df['resume_text'].tolist()
        self.sentences_b = df['job_description_text'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        encoding_a = self.tokenizer(
            self.sentences_a[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        encoding_b = self.tokenizer(
            self.sentences_b[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        item = {
            'input_ids_a': encoding_a['input_ids'].squeeze(0),
            'attention_mask_a': encoding_a['attention_mask'].squeeze(0),
            'input_ids_b': encoding_b['input_ids'].squeeze(0),
            'attention_mask_b': encoding_b['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item

In [8]:
processed_df.head()

Unnamed: 0,resume_text,job_description_text,label,combined_text
0,summarya business management graduate signific...,position title senior accountant organization ...,2,summarya business management graduate signific...
1,professional profilecapable international tax ...,rolegaming business analystresponsibilities ab...,1,professional profilecapable international tax ...
2,professional profilehighly motivated sales ass...,handle accounting responsibilities growing pro...,1,professional profilehighly motivated sales ass...
3,summaryorganized motivated employee eager appl...,client growing medical device company located ...,1,summaryorganized motivated employee eager appl...
4,summaryemployed navy civilian electrical engin...,seeking detail oriented analytical individual ...,1,summaryemployed navy civilian electrical engin...


In [9]:
# ----------- DATA LOADERS -----------------

train_dataset = SentencePairDataset(processed_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

eval_dataset = SentencePairDataset(processed_eval, tokenizer)
eval_loader = DataLoader(eval_dataset, batch_size=4, shuffle=False)

test_dataset = SentencePairDataset(processed_test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [10]:
class SBERT(nn.Module):
    def __init__(self, bert_model):
        super(SBERT, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(self.bert.config.hidden_size * 3, 3)  # 3 classes for classification

    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b):
        outputs_a = self.bert(input_ids_a, attention_mask=attention_mask_a)
        pooled_output_a = outputs_a.last_hidden_state[:, 0, :]  # CLS token for sentence A

        outputs_b = self.bert(input_ids_b, attention_mask=attention_mask_b)
        pooled_output_b = outputs_b.last_hidden_state[:, 0, :]  # CLS token for sentence B

        abs_diff = torch.abs(pooled_output_a - pooled_output_b)
        combined = torch.cat((pooled_output_a, pooled_output_b, abs_diff), dim=1)
        logits = self.fc(combined)

        return logits  # Return raw logits; softmax will be applied in loss calculation

In [11]:
# Check for available devices and assign the appropriate one
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [12]:
print(device)

cuda


In [13]:
# Initialize SBERT model and send it to the selected device
sbert = SBERT(model).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(sbert.parameters(), lr=2e-5)

In [14]:
# Training loop with eval set included
num_epochs = 3
for epoch in range(num_epochs):
    sbert.train()
    epoch_loss = 0
    for batch in train_loader:
        input_ids_a = batch['input_ids_a'].to(device)
        attention_mask_a = batch['attention_mask_a'].to(device)
        input_ids_b = batch['input_ids_b'].to(device)
        attention_mask_b = batch['attention_mask_b'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = sbert(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}, Train Loss: {epoch_loss:.4f}")

    # Evaluation on eval set
    sbert.eval()
    eval_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in eval_loader:
            input_ids_a = batch['input_ids_a'].to(device)
            attention_mask_a = batch['attention_mask_a'].to(device)
            input_ids_b = batch['input_ids_b'].to(device)
            attention_mask_b = batch['attention_mask_b'].to(device)
            labels = batch['labels'].to(device)

            outputs = sbert(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
            loss = loss_fn(outputs, labels)
            eval_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)
    
    eval_accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch + 1}, Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}")

Epoch 1, Train Loss: 1154.1459
Epoch 1, Eval Loss: 223.0265, Eval Accuracy: 0.6573
Epoch 2, Train Loss: 840.1487
Epoch 2, Eval Loss: 195.2219, Eval Accuracy: 0.7182
Epoch 3, Train Loss: 722.9590
Epoch 3, Eval Loss: 182.5196, Eval Accuracy: 0.7382


In [15]:
# Final test performance check
sbert.eval()
test_loss = 0
correct_predictions = 0
total_predictions = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids_a = batch['input_ids_a'].to(device)
        attention_mask_a = batch['attention_mask_a'].to(device)
        input_ids_b = batch['input_ids_b'].to(device)
        attention_mask_b = batch['attention_mask_b'].to(device)
        labels = batch['labels'].to(device)

        outputs = sbert(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
        loss = loss_fn(outputs, labels)
        test_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

test_accuracy = correct_predictions / total_predictions
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Test Loss: 396.4787, Test Accuracy: 0.5571


In [16]:
eval_accuracy = [0.6573,0.7182,0.7382]
test_accuracy = 0.5571

print("Final Eval accuracy = ", sum(eval_accuracy) / len(eval_accuracy))
print("Final Test accuracy = ", test_accuracy)

Final Eval accuracy =  0.7045666666666666
Final Test accuracy =  0.5571


The cell below can be used to checkpoint/save the weights of the model.

In [None]:
# torch.save({
#     'model_state_dict': model.state_dict(),
#     'optimizer_state_dict': optimizer.state_dict(),
# }, "model_and_optimizer_nn_1.pth")


# Load both model and optimizer state_dicts
# checkpoint = torch.load("model_and_optimizer.pth")
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])