In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import torch 
from pathlib import Path
import PyPDF2

In [None]:
# Root folder of the resume PDFs. It is searched recursively, and the name of
# the folder that directly contains each PDF is used as its occupation label.
path_pdf = "Resume Dataset/data"

In [None]:
def read_pdfs_to_dataframe(root_folder_path):
    """
    Read every PDF under a root folder into a DataFrame of labelled resumes.

    The occupation label of a resume is the name of the folder that directly
    contains its PDF. The resume text is the text of all pages with blank
    lines removed and the first non-empty line dropped.

    Args:
        root_folder_path (str or pathlib.Path): Root folder that is searched
            recursively for ``*.pdf`` files.

    Returns:
        pandas.DataFrame: One row per PDF found, with columns
        ['occupation', 'resume']. A file that cannot be read contributes a
        row whose two fields are empty strings. When no PDF is found the
        frame is empty but still has both columns.
    """
    pdf_data = []
    root_path = Path(root_folder_path)

    # Sort the matches so the row order is reproducible across runs and
    # filesystems (rglob yields files in arbitrary directory order).
    for pdf_file in sorted(root_path.rglob("*.pdf")):
        # Parent folder name is the occupation. pathlib handles the separator
        # of the current OS; splitting the path on '\\' only worked on Windows
        # and raised IndexError everywhere else.
        strip_category = pdf_file.parent.name
        try:
            with open(pdf_file, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # A page without extractable text (e.g. a scanned image) may
                # give None or ''; treat it as empty instead of failing the
                # whole file.
                text = "\n".join(
                    (page.extract_text() or "") for page in pdf_reader.pages
                )

            # Split text into lines and filter out empty lines
            lines = [line.strip() for line in text.split('\n') if line.strip()]

            # The first non-empty line is left out of the resume text.
            # NOTE(review): presumably it is the job-title heading, which
            # would leak the label into the features - confirm.
            resume = '\n'.join(lines[1:])

            pdf_data.append({
                'occupation': strip_category,
                'resume': resume
            })

            print(f"Successfully processed: {pdf_file}")

        except Exception as e:
            # Best effort: report the failure and keep going with an empty row.
            print(f"Error reading {pdf_file}: {str(e)}")
            pdf_data.append({
                'occupation': "",
                'resume': ""
            })

    # Fix the columns so an empty result still exposes the expected schema.
    return pd.DataFrame(pdf_data, columns=['occupation', 'resume'])

In [None]:
# Load every PDF found under path_pdf into a DataFrame of resumes.
df = read_pdfs_to_dataframe(root_folder_path=path_pdf)

In [None]:
def clean_and_tokenize(text):
    """
    Turn raw resume text into lowercase alphabetic tokens without stopwords.

    HTML-like tags and every character other than ASCII letters and spaces
    are replaced by spaces, the text is lowercased and split on whitespace,
    and NLTK's English stopwords are removed.

    Args:
        text (str): Raw text. Any non-string value (for example None or the
            NaN pandas uses for a missing cell) is treated as empty text.

    Returns:
        list[str]: The remaining tokens in their original order; an empty
        list when nothing is left.
    """
    if not isinstance(text, str):
        return []

    text = re.sub(r'<[^>]+>', ' ', text)               # Remove HTML tags
    text = re.sub(r'[^a-zA-Z ]', ' ', text)            # Remove special chars
    tokens = text.lower().split()                      # Lowercase + tokenize
    if not tokens:
        return []

    # Build the stopword set once per call. Calling stopwords.words() inside
    # the comprehension rebuilt the list for every single token and then
    # scanned it linearly; a set gives constant-time membership checks.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word not in english_stopwords]

# Add a 'tokens' column holding the cleaned token list of each resume.
df['tokens'] = df['resume'].apply(clean_and_tokenize)

In [None]:
# Persist the frame as CSV without the index column.
# NOTE(review): any column holding Python lists is written as the list's
# string form and has to be parsed again after reading the file back.
df.to_csv(
    'Resume Dataset/pdf_to_resume.csv',
    index=False,
)