In [21]:
import os
import fitz  # PyMuPDF for PDF handling
import spacy
import certifi

# Set the SSL_CERT_FILE environment variable for SSL certificate verification
os.environ['SSL_CERT_FILE'] = certifi.where()

def read_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def preprocess_text(text):
    # Remove extra spaces and newlines
    return " ".join(text.split())

# Directory containing the resume files
resumes_directory = '/Users/utsavsharma/Desktop/RR/Resumes'

# List to store the contents of each resume
resumes = []

# Iterate through each file in the directory
for filename in os.listdir(resumes_directory):
    file_path = os.path.join(resumes_directory, filename)
    if filename.endswith('.pdf'):
        resumes.append(preprocess_text(read_pdf(file_path)))

In [22]:
import spacy

# Load spaCy NER model
nlp = spacy.load('en_core_web_sm')

def extract_entities(text):
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))
    return entities

# Extract entities from each resume
for resume in resumes:
    entities = extract_entities(resume)
    print(f"Entities in resume: {entities}")

Entities in resume: [('Alexander Kim kim.a@ufl.edu 321-594-8873', 'PERSON'), ('4.00', 'CARDINAL'), ('Honors Program', 'ORG'), ('Univeristy Research Scholars Program', 'ORG'), ('June 2020 - December 2023', 'DATE'), ('weekly', 'DATE'), ('Dream Team Engineering University', 'ORG'), ('2023 - Present • Collaborate', 'DATE'), ('UF Health Shands Hospital', 'ORG'), ('• Design', 'PRODUCT'), ('Technology Student Association Celebration High School • President', 'ORG'), ('2019 - May 2023', 'DATE'), ('FL', 'ORG'), ('• Led', 'PERSON'), ('three', 'CARDINAL'), ('Society of Robotic Surgery', 'ORG'), ('May 2020 - May 2022', 'DATE'), ('2020', 'DATE'), ('• Awarded', 'ORG'), ('2021', 'DATE'), ('TabbedIn Dream', 'ORG'), ('2023 DTE Prescription', 'MONEY'), ('Jigsaw Jutsu Technology Student Association • https://simmer.io/@bob011/tsa2023videogame-team2725-1 • February 2023', 'ORG'), ('the Florida State TSA Conference Video Game Design', 'ORG'), ('• Served', 'PERSON'), ('SKILLS Java Python', 'PERSON'), ('C++ 

In [23]:
import pandas as pd

# Define a function to extract and organize entities
def extract_resume_info(text):
    doc = nlp(text)
    info = {
        "Name": [],
        "Email": [],
        "Education": [],
        "Skills": [],
        "Experience": []
    }
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            info["Name"].append(ent.text)
        elif ent.label_ == "EMAIL":
            info["Email"].append(ent.text)
        elif ent.label_ in ["ORG", "GPE"]:
            info["Education"].append(ent.text)
        elif ent.label_ == "SKILL":  # Assuming you have custom skills entities
            info["Skills"].append(ent.text)
        elif ent.label_ == "WORK_OF_ART":  # This is just an example, adjust as necessary
            info["Experience"].append(ent.text)
    return info

# Create a DataFrame to store all resume information
resume_data = []

for resume in resumes:
    resume_info = extract_resume_info(resume)
    resume_data.append(resume_info)

df = pd.DataFrame(resume_data)

# Save the DataFrame to a CSV file
csv_file_path = '/Users/utsavsharma/Desktop/RR/resume_data.csv'
df.to_csv(csv_file_path, index=False)

print(f"Resume data has been saved to {csv_file_path}")


Resume data has been saved to /Users/utsavsharma/Desktop/RR/resume_data.csv


In [24]:
# Path to your CSV file
csv_file_path = '/Users/utsavsharma/Desktop/RR/resume_data.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

In [25]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Define a list of common skill keywords (you can expand this list)
common_skills = [
    'Python', 'Java', 'C++', 'SQL', 'Machine Learning', 'Data Science', 
    'Deep Learning', 'Natural Language Processing', 'NLP', 'Statistics', 
    'Data Analysis', 'TensorFlow', 'PyTorch', 'Keras', 'Excel', 'R', 'Git'
]

def extract_skills(text):
    doc = nlp(text)
    skills = []
    for token in doc:
        if token.text in common_skills:
            skills.append(token.text)
    return skills

# Apply the skill extraction function to each row in the DataFrame
df['Skills_Extracted'] = df['Skills'].apply(lambda x: extract_skills(str(x)))

In [26]:
# Aggregate skills
all_skills = df['Skills_Extracted'].explode().value_counts()

# Convert to a DataFrame for neat presentation
skills_df = pd.DataFrame(all_skills).reset_index()
skills_df.columns = ['Skill', 'Count']

# Display the DataFrame
print(skills_df)

Empty DataFrame
Columns: [Skill, Count]
Index: []
