In [17]:
import os
import zipfile
import pandas as pd
import pdfplumber
from docx import Document
import re
import nltk
from nltk.corpus import stopwords


In [18]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
ZIP_PATH = "data/Resumes.zip"
EXTRACT_PATH = "data/resumes"



In [19]:
os.makedirs(EXTRACT_PATH, exist_ok=True)

with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
    zip_ref.extractall(EXTRACT_PATH)

print("Resumes extracted successfully")


Resumes extracted successfully


In [20]:
def read_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ""
    return text


def read_docx(file_path):
    doc = Document(file_path)
    return " ".join(p.text for p in doc.paragraphs)


In [21]:
data = []
base_dir = os.path.join(EXTRACT_PATH, "Resumes")

for category in os.listdir(base_dir):
    category_path = os.path.join(base_dir, category)

    if os.path.isdir(category_path):
        for file in os.listdir(category_path):
            file_path = os.path.join(category_path, file)

            if file.lower().endswith(".pdf"):
                text = read_pdf(file_path)
            elif file.lower().endswith(".docx"):
                text = read_docx(file_path)
            else:
                continue

            if text and text.strip():
                data.append({
                    "text": text,
                    "category": category
                })

df = pd.DataFrame(data)
print("Total resumes:", len(df))
df.head()


Total resumes: 79


Unnamed: 0,text,category
0,3.3 years of IT experience as Workday HCM T...,workday
1,Seeking suitable positions in Workday HCM as ...,workday
2,Name\t: Naresh Babu Cherukuri Objective: To t...,workday
3,HIMA.MENDU Career Objective To continue gr...,workday
4,Hari Krishna M Summary: A result oriented pr...,workday


In [22]:
df.to_csv("data/resume_raw_dataset.csv", index=False)
print("Raw dataset saved")


Raw dataset saved


In [23]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'\b\d{10,12}\b', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    words = text.split()
    words = [w for w in words if w not in stop_words and len(w) > 2]

    return " ".join(words)


In [24]:
df["clean_text"] = df["text"].apply(clean_text)
df["word_count"] = df["clean_text"].apply(lambda x: len(x.split()))
df.head()


Unnamed: 0,text,category,clean_text,word_count
0,3.3 years of IT experience as Workday HCM T...,workday,years experience workday hcm technical consult...,422
1,Seeking suitable positions in Workday HCM as ...,workday,seeking suitable positions workday hcm techno ...,630
2,Name\t: Naresh Babu Cherukuri Objective: To t...,workday,name naresh babu cherukuri objective take resp...,631
3,HIMA.MENDU Career Objective To continue gr...,workday,hima mendu career objective continue growing k...,347
4,Hari Krishna M Summary: A result oriented pr...,workday,hari krishna summary result oriented professio...,773


In [25]:
df = df[df["word_count"] >= 50].reset_index(drop=True)
df.shape


(78, 4)

In [26]:
df.to_csv("data/resume_cleaned_dataset.csv", index=False)
print("Cleaned dataset saved")


Cleaned dataset saved
