In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re 
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('data/train.csv')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
le = LabelEncoder()

# def preprocess(sentence):
#     sentence=str(sentence)
#     sentence = sentence.lower()
#     sentence=sentence.replace('{html}',"")
#     cleanr = re.compile('<.*?>')
#     cleantext = re.sub(cleanr, '', sentence)
#     rem_url=re.sub(r'http\S+', '',cleantext)
#     rem_num = re.sub('[0-9]+', '', rem_url)
#     tokenizer = RegexpTokenizer(r'\w+')
#     tokens = tokenizer.tokenize(rem_num)  
#     filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
#     stem_words=[stemmer.stem(w) for w in filtered_words]
#     lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
#     return " ".join(filtered_words)

# print("--------PROCESSING TRAINING DATA--------")
# processed_df = df.copy()
# processed_df['resume_text'] = processed_df['resume_text'].map(lambda s: preprocess(s))
# processed_df['job_description_text'] = processed_df['job_description_text'].map(lambda s: preprocess(s))
# processed_df['combined_text'] = processed_df['resume_text'] + ' ' + processed_df['job_description_text']
# processed_df['label'] = le.fit_transform(processed_df['label'])

# print("--------PROCESSING TEST DATA--------")
# test_df = pd.read_csv('data/test.csv')
# processed_test_df = test_df.copy()

# processed_test_df['resume_text'] = processed_test_df['resume_text'].map(lambda s: preprocess(s))
# processed_test_df['job_description_text'] = processed_test_df['job_description_text'].map(lambda s: preprocess(s))
# processed_test_df['combined_text'] = processed_test_df['resume_text'] + ' ' + processed_test_df['job_description_text']
# processed_test_df['label'] = le.transform(processed_test_df['label'])

In [19]:
# processed_df.to_csv('data/processed_train.csv', index=False)
# processed_test_df.to_csv('data/processed_test.csv', index=False)

In [None]:
# # Making an evaluation set from the training set
# from sklearn.model_selection import train_test_split

# df_processed = pd.read_csv('data/processed_train.csv')
# train_data, eval_data = train_test_split(df_processed, test_size=0.2, random_state=42)
# train_data.to_csv('data/processed_train.csv', index=False)
# eval_data.to_csv('data/processed_eval.csv', index=False)

In [20]:
processed_df = pd.read_csv('data/processed_train.csv')
processed_test_df = pd.read_csv('data/processed_test.csv')

In [21]:
import torch
# from transformers import BertTokenizer, BertModel

# # Load a pretrained model (BERT in this case) and its tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

from transformers import DistilBertTokenizer, DistilBertModel

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [22]:
from torch.utils.data import Dataset

class SentencePairDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512):
        """
        df: Pandas DataFrame with columns 'resume_text', 'job_description_text', and 'label'
        """
        self.sentences_a = df['resume_text'].tolist()
        self.sentences_b = df['job_description_text'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        # Tokenize the resume (sentence_a) and job description (sentence_b)
        encoding_a = self.tokenizer(
            self.sentences_a[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        encoding_b = self.tokenizer(
            self.sentences_b[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # Create a dictionary with input_ids and attention_mask for both inputs
        item = {
            'input_ids_a': encoding_a['input_ids'].squeeze(0),  # Resume input
            'attention_mask_a': encoding_a['attention_mask'].squeeze(0),  # Resume attention mask
            'input_ids_b': encoding_b['input_ids'].squeeze(0),  # Job description input
            'attention_mask_b': encoding_b['attention_mask'].squeeze(0),  # Job description attention mask
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)  # Labels (classification)
        }

        return item

In [23]:
processed_df.head()

Unnamed: 0,resume_text,job_description_text,label,combined_text
0,summaryhighly motivated sales associate extens...,netsource inc award winning total workforce so...,1,summaryhighly motivated sales associate extens...
1,professional summarycurrently working caterpil...,salas obrien tell clients engineered impact pa...,1,professional summarycurrently working caterpil...
2,summaryi started construction career june jack...,schweitzer engineering laboratories sel infras...,1,summaryi started construction career june jack...
3,summarycertified electrical foremanwith thirte...,mizick miller company inc looking dynamic indi...,1,summarycertified electrical foremanwith thirte...
4,summarywith extensive experience business requ...,life capgemini capgemini supports aspects well...,1,summarywith extensive experience business requ...


In [24]:
from torch.utils.data import DataLoader
import pandas as pd

dataset = SentencePairDataset(processed_df, tokenizer)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)  # Adjust batch size as needed

In [25]:
import torch.nn as nn
import torch

class SBERT(nn.Module):
    def __init__(self, bert_model):
        super(SBERT, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(self.bert.config.hidden_size * 3, 3)  # 3 classes for classification
        self.softmax = nn.Softmax(dim=1)  # Softmax for classification

    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b):
        # Get the BERT outputs for both sentence pairs
        outputs_a = self.bert(input_ids_a, attention_mask=attention_mask_a)
        pooled_output_a = outputs_a.last_hidden_state[:, 0, :]  # CLS token for sentence A

        outputs_b = self.bert(input_ids_b, attention_mask=attention_mask_b)
        pooled_output_b = outputs_b.last_hidden_state[:, 0, :]  # CLS token for sentence B

        # Compute the element-wise absolute difference |u - v|
        abs_diff = torch.abs(pooled_output_a - pooled_output_b)

        # Concatenate u, v, and |u - v|
        combined = torch.cat((pooled_output_a, pooled_output_b, abs_diff), dim=1)

        # Pass through a fully connected layer
        logits = self.fc(combined)

        # Apply softmax to get classification probabilities
        probabilities = self.softmax(logits)

        return probabilities

        # Apply softmax to get classification probabilities
        probabilities = self.softmax(logits)

        return probabilities

In [26]:
import torch.optim as optim
import torch.nn as nn

# Check for available devices and assign the appropriate one
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Initialize SBERT model and send it to the selected device
sbert = SBERT(model).to(device)

# Define loss and optimizer
loss_fn = nn.CrossEntropyLoss()  # CrossEntropyLoss for classification
optimizer = optim.Adam(sbert.parameters(), lr=2e-5)

# Set the accumulation steps (e.g., accumulate gradients over 4 smaller batches)
accumulation_steps = 4

# Training loop
sbert.train()
for epoch in range(3):  # Train for 3 epochs (adjust as needed)
    epoch_loss = 0
    for batch in train_loader:
        # Move input data to the same device as the model (MPS, CUDA, or CPU)
        input_ids_a = batch['input_ids_a'].to(device)  # Resume input
        attention_mask_a = batch['attention_mask_a'].to(device)  # Attention mask for resume
        input_ids_b = batch['input_ids_b'].to(device)  # Job description input
        attention_mask_b = batch['attention_mask_b'].to(device)  # Attention mask for job description
        labels = batch['labels'].to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = sbert(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
        loss = loss_fn(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

Epoch 1, Loss: 1533.9761
Epoch 2, Loss: 1356.0796
Epoch 3, Loss: 1281.2727
