### 1. Extracting text from PDF

In [None]:
import os
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF using PyPDF2."""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = " ".join([page.extract_text().replace("\n", " ") for page in reader.pages if page.extract_text()]) # Replace \n with space
    return text

def load_dataset(base_dir):
    """Load PDF files and extract text from each category."""
    data = {}

    for category in os.listdir(base_dir):
        folder_path = os.path.join(base_dir, category)
        if os.path.isdir(folder_path):
            data[category] = []
            for file in os.listdir(folder_path):
                if file.endswith(".pdf"):
                    pdf_path = os.path.join(folder_path, file)
                    text = extract_text_from_pdf(pdf_path)
                    data[category].append(text)
            print(f"Loaded {len(data[category])} files from category '{category}'")

    return data

# Load train dataset
dataset = load_dataset("dataset/data")


In [None]:
# Print a sample extracted text from ACCOUNTANT category
print(dataset["ACCOUNTANT"][0])

### 2. Convert data into Pandas dataframe

In [14]:
# Convert into dataframe
import pandas as pd
def convert_to_dataframe(dataset):
    """Convert the dataset dictionary to a pandas DataFrame."""
    records = []
    for category, texts in dataset.items():
        for text in texts:
            records.append({"Category": category, "Resume_Text": text})
    return pd.DataFrame(records)

# Convert dataset to DataFrame
data_df = convert_to_dataframe(dataset)

# Preview
print(data_df)

        Category                                        Resume_Text
0     ACCOUNTANT  ACCOUNTANT Summary Financial Accountant specia...
1     ACCOUNTANT  STAFF ACCOUNTANT Summary Highly analytical and...
2     ACCOUNTANT  ACCOUNTANT Professional Summary To obtain a po...
3     ACCOUNTANT  SENIOR ACCOUNTANT Experience Company Name   Ju...
4     ACCOUNTANT  SENIOR ACCOUNTANT Professional Summary Senior ...
...          ...                                                ...
2479     TEACHER  READING TEACHER Summary I am a highly motivate...
2480     TEACHER  HISTORY TEACHER Professional Summary To be emp...
2481     TEACHER  TEACHER Summary Highly ethical, dependable, an...
2482     TEACHER  TEACHER Summary Talented early education profe...
2483     TEACHER  Kpandipou Koffi Summary Compassionate teaching...

[2484 rows x 2 columns]


### 3. Data Preprocessing

In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are downloaded
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Clean, preprocess, and lemmatize text."""
    if pd.isna(text):  # Handle missing values
        return ""

    # Remove special characters, numbers, and extra spaces
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces
    text = re.sub(r"\W+", " ", text)  # Remove special characters
    text = re.sub(r"\d+", " ", text)  # Remove numbers
    
    # Convert to lowercase and strip spaces
    text = text.lower().strip()
    
    # Tokenize and remove stopwords
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize tokens
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    return " ".join(lemmatized_tokens)

# Apply text cleaning
data_df["Cleaned_Text"] = data_df["Resume_Text"].apply(clean_text)

print(data_df['Cleaned_Text'][0])  # Verify cleaned text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shany\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shany\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shany\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


accountant summary financial accountant specializing financial planning reporting analysis within department defense highlight account reconciliation result oriented financial reporting critical thinking accounting operation professional analysis financial system erp enterprise resource planning software excellent facilitator accomplishment served tiger team identified resolved general ledger posting deams totaling b accounting adjustment allowed first successful fiscal year end close collaboration dfas europe developed automated tool identified duplicate obligation tool allowed hq usafe deobligate duplicate obligation experience company name july november accountant city state enterprise resource planning office ero position accountant assigned defense enterprise accounting management system deams ero responsible identifying resolving issue affecting deams general ledger worked teammate procure pay order cash budget report area resolve daily challenge encountered deployment deams addi

### 4. Building LSTM model

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

In [18]:
# Split the dataset into training and testing sets
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42, stratify=data_df['Category'])

X_train = train_df['Cleaned_Text'].values
y_train = train_df['Category'].values
X_test = test_df['Cleaned_Text'].values
y_test = test_df['Category'].values

In [24]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1987,), (1987,), (497,), (497,))

In [21]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Get label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Category Label Mapping:", label_mapping)

Category Label Mapping: {'ACCOUNTANT': np.int64(0), 'ADVOCATE': np.int64(1), 'AGRICULTURE': np.int64(2), 'APPAREL': np.int64(3), 'ARTS': np.int64(4), 'AUTOMOBILE': np.int64(5), 'AVIATION': np.int64(6), 'BANKING': np.int64(7), 'BPO': np.int64(8), 'BUSINESS-DEVELOPMENT': np.int64(9), 'CHEF': np.int64(10), 'CONSTRUCTION': np.int64(11), 'CONSULTANT': np.int64(12), 'DESIGNER': np.int64(13), 'DIGITAL-MEDIA': np.int64(14), 'ENGINEERING': np.int64(15), 'FINANCE': np.int64(16), 'FITNESS': np.int64(17), 'HEALTHCARE': np.int64(18), 'HR': np.int64(19), 'INFORMATION-TECHNOLOGY': np.int64(20), 'PUBLIC-RELATIONS': np.int64(21), 'SALES': np.int64(22), 'TEACHER': np.int64(23)}


In [38]:
# Tokenize the text
tokenizer = Tokenizer(num_words=5000)  # Limit vocabulary size
tokenizer.fit_on_texts(X_train)

# Convert text into sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure equal length
max_len = max(len(seq) for seq in X_train_seq)  # Find max sequence length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="post")

print("Shape of Training Data:", X_train_pad.shape)
print("Shape of Testing Data:", X_test_pad.shape)

Shape of Training Data: (1987, 3309)
Shape of Testing Data: (497, 3309)


In [40]:
class ResumeDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create PyTorch datasets
train_dataset = ResumeDataset(X_train_pad, y_train_encoded)
test_dataset = ResumeDataset(X_test_pad, y_test_encoded)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [44]:
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size=5000, embedding_dim=128, hidden_dim=64, output_dim=24, dropout_prob=0.2):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru1 = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.dropout1 = nn.Dropout(dropout_prob)
        self.gru2 = nn.GRU(hidden_dim, hidden_dim // 2, batch_first=True)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_dim // 2, output_dim)
    
    def forward(self, x):
        x = self.embedding(x.long())
        x = self.dropout1(x)
        gru_out1, _ = self.gru1(x)
        gru_out1 = self.dropout1(gru_out1)
        gru_out2, _ = self.gru2(gru_out1)
        out = self.dropout2(gru_out2[:, -1, :])
        return self.fc(out)

# Instantiate model
model = GRUClassifier()
print(model)

GRUClassifier(
  (embedding): Embedding(5000, 128)
  (gru1): GRU(128, 64, batch_first=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (gru2): GRU(64, 32, batch_first=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=32, out_features=24, bias=True)
)


In [None]:
# class LSTMClassifier(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout_prob=0.2):
#         super(LSTMClassifier, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
#         self.dropout1 = nn.Dropout(dropout_prob)
#         self.lstm2 = nn.LSTM(hidden_dim, hidden_dim // 2, batch_first=True)
#         self.dropout2 = nn.Dropout(dropout_prob)
#         self.fc = nn.Linear(hidden_dim // 2, output_dim)
    
#     def forward(self, x):
#         x = self.embedding(x.long())
#         x = self.dropout1(x)
#         lstm_out1, _ = self.lstm1(x)
#         lstm_out1 = self.dropout1(lstm_out1)
#         lstm_out2, _ = self.lstm2(lstm_out1)
#         out = self.dropout2(lstm_out2[:, -1, :])
#         return self.fc(out)

# # Model parameters
# vocab_size = 5000
# embedding_dim = 128
# hidden_dim = 64
# output_dim = len(label_mapping)

# # Instantiate model
# model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
# print(model)

LSTMClassifier(
  (embedding): Embedding(5000, 128)
  (lstm1): LSTM(128, 64, batch_first=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (lstm2): LSTM(64, 32, batch_first=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=32, out_features=24, bias=True)
)


In [45]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

Epoch 1/10, Loss: 200.4157
Epoch 2/10, Loss: 200.3176
Epoch 3/10, Loss: 199.2496
Epoch 4/10, Loss: 199.3876
Epoch 5/10, Loss: 199.5937
Epoch 6/10, Loss: 199.2390
Epoch 7/10, Loss: 199.0421
Epoch 8/10, Loss: 198.5726
Epoch 9/10, Loss: 198.8642
Epoch 10/10, Loss: 198.7186


In [46]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        predicted = torch.argmax(outputs, dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.05


In [47]:
sample_text = ["Experienced financial analyst with expertise in investment banking."]
sample_text = [clean_text(text) for text in sample_text]
sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_pad = pad_sequences(sample_seq, maxlen=max_len, padding="post")

sample_tensor = torch.tensor(sample_pad, dtype=torch.float32)

model.eval()
with torch.no_grad():
    output = model(sample_tensor)
    predicted_label = torch.argmax(output, dim=1).item()

print(f"Predicted Category: {label_encoder.inverse_transform([predicted_label])[0]}")

Predicted Category: CHEF


In [48]:
# For clearing parameters
def clear_parameters(model):
    """Clear model parameters to free up memory."""
    model.train(False)  # Set to evaluation mode
    for param in model.parameters():
        param.data.zero_()  # Zero out parameter data
        param.grad = None   # Clear gradients
        param.requires_grad = False  # Disable gradient computation
    return model

clear_parameters(model)

GRUClassifier(
  (embedding): Embedding(5000, 128)
  (gru1): GRU(128, 64, batch_first=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (gru2): GRU(64, 32, batch_first=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=32, out_features=24, bias=True)
)