### 1. Extracting text from PDF

In [None]:
import os
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF using PyPDF2."""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = " ".join([page.extract_text().replace("\n", " ") for page in reader.pages if page.extract_text()]) # Replace \n with space
    return text

def load_dataset(base_dir):
    """Load PDF files and extract text from each category."""
    data = {}

    for category in os.listdir(base_dir):
        folder_path = os.path.join(base_dir, category)
        if os.path.isdir(folder_path):
            data[category] = []
            for file in os.listdir(folder_path):
                if file.endswith(".pdf"):
                    pdf_path = os.path.join(folder_path, file)
                    text = extract_text_from_pdf(pdf_path)
                    data[category].append(text)

    return data

# Load train dataset
train_data = load_dataset("dataset_small/train")
test_data = load_dataset("dataset_small/test")

# Print a sample extracted text from BANKING category
print(train_data["BANKING"][0])


REGISTERED CLIENT SERVICE ASSOCIATE Summary To obtain a position where my years of experience in the client support environment and proven track record of maintaining and developing new businesses can be fully utilized. Results-oriented, high-energy, hands-on professional, with a successful record of accomplishments in client support.  Major strengths include strong leadership, excellent communication skills, strong team player, attention to detail, compliance in all regulated environment and supervisory skills. Skills Microsoft Word for Windows, Excel, Power Point, Access, Adobe PageMaker 6.5, Adobe Photoshop, Dreamweaver, Visio, Unix, Oracle 9i and Developer 2000. Experience Registered Client Service Associate   06/2012   to  Current   Company Name   City   ,  State Serve as primary contact to investment clients and prospects. Provide all operational services and support for client accounts. Provide Financial Advisors with presentations and seminars, aide in the implementation of mar

### 2. Convert data into Pandas dataframe

In [21]:
import pandas as pd

# Convert extracted resume text into structured format
train_df = pd.DataFrame(train_data).melt(var_name="Category", value_name="Resume_Text")
test_df = pd.DataFrame(test_data).melt(var_name="Category", value_name="Resume_Text")

print(train_df)  # Sample preview

   Category                                        Resume_Text
0   BANKING  REGISTERED CLIENT SERVICE ASSOCIATE Summary To...
1   BANKING  VICE PRESIDENT Summary Seeking a Program/Proje...
2   BANKING  OPERATIONS MANAGER Summary Experienced client ...
3   BANKING  ACCOUNT RECEIVABLE Executive Summary Champion ...
4   BANKING  MORTGAGE BANKING FORECLOSURE SPECIALIST Summar...
5      CHEF  SOUS CHEF Work Experience Sous Chef   Jul 2010...
6      CHEF  FOOD PREP CHEF Skills Highly skilled in cookin...
7      CHEF  GENERAL MANAGER / EXECUTIVE CHEF Summary Dedic...
8      CHEF  MANAGER AND EXECUTIVE CHEF Profile Results ori...
9      CHEF  EXECUTIVE CHEF /CHEF MANAGER Experience Execut...
10  TEACHER  READING TEACHER Summary I am a highly motivate...
11  TEACHER  HISTORY TEACHER Professional Summary To be emp...
12  TEACHER  TEACHER Summary Highly ethical, dependable, an...
13  TEACHER  TEACHER Summary Talented early education profe...
14  TEACHER  Kpandipou Koffi Summary Compassionate teac

### 3. Data Preprocessing

In [27]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are downloaded
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Clean, preprocess, and lemmatize text."""
    if pd.isna(text):  # Handle missing values
        return ""

    # Remove special characters, numbers, and extra spaces
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces
    text = re.sub(r"\W+", " ", text)  # Remove special characters
    text = re.sub(r"\d+", " ", text)  # Remove numbers
    
    # Convert to lowercase and strip spaces
    text = text.lower().strip()
    
    # Tokenize and remove stopwords
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize tokens
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    return " ".join(lemmatized_tokens)

# Apply text cleaning
train_df["Cleaned_Text"] = train_df["Resume_Text"].apply(clean_text)
test_df["Cleaned_Text"] = test_df["Resume_Text"].apply(clean_text)

print(train_df['Cleaned_Text'][0])  # Verify cleaned text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shany\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shany\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shany\AppData\Roaming\nltk_data...


registered client service associate summary obtain position year experience client support environment proven track record maintaining developing new business fully utilized result oriented high energy hand professional successful record accomplishment client support major strength include strong leadership excellent communication skill strong team player attention detail compliance regulated environment supervisory skill skill microsoft word window excel power point access adobe pagemaker adobe photoshop dreamweaver visio unix oracle developer experience registered client service associate current company name city state serve primary contact investment client prospect provide operational service support client account provide financial advisor presentation seminar aide implementation marketing material prospect referring client line banking division enhance relationship work financial advisor efficiently manage book business increase revenue bank conduct monthly audit make sure clien

### 4. Building LSTM model

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

In [29]:
# Load preprocessed text and labels
X_train, y_train = train_df["Cleaned_Text"], train_df["Category"]
X_test, y_test = test_df["Cleaned_Text"], test_df["Category"]

In [30]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Get label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Category Label Mapping:", label_mapping)

Category Label Mapping: {'BANKING': np.int64(0), 'CHEF': np.int64(1), 'TEACHER': np.int64(2)}


In [31]:
# Tokenize the text
tokenizer = Tokenizer(num_words=5000)  # Limit vocabulary size
tokenizer.fit_on_texts(X_train)

# Convert text into sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure equal length
max_len = max(len(seq) for seq in X_train_seq)  # Find max sequence length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="post")

print("Shape of Training Data:", X_train_pad.shape)
print("Shape of Testing Data:", X_test_pad.shape)

Shape of Training Data: (15, 875)
Shape of Testing Data: (6, 875)


In [None]:
class ResumeDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create PyTorch datasets
train_dataset = ResumeDataset(X_train_pad, y_train_encoded)
test_dataset = ResumeDataset(X_test_pad, y_test_encoded)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [40]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x.long())  # Convert to embedding
        lstm_out, _ = self.lstm(x)
        final_out = lstm_out[:, -1, :]  # Take last timestep output
        return self.fc(final_out)

# Model parameters
vocab_size = 5000
embedding_dim = 128
hidden_dim = 64
output_dim = len(label_mapping)

# Instantiate model
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
print(model)

LSTMClassifier(
  (embedding): Embedding(5000, 128)
  (lstm): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)


In [None]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

In [44]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        predicted = torch.argmax(outputs, dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.33


In [46]:
sample_text = ["Experienced financial analyst with expertise in investment banking."]
sample_text = [clean_text(text) for text in sample_text]
sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_pad = pad_sequences(sample_seq, maxlen=max_len, padding="post")

sample_tensor = torch.tensor(sample_pad, dtype=torch.float32)

model.eval()
with torch.no_grad():
    output = model(sample_tensor)
    predicted_label = torch.argmax(output, dim=1).item()

print(f"Predicted Category: {label_encoder.inverse_transform([predicted_label])[0]}")

Predicted Category: BANKING
