<a href="https://colab.research.google.com/github/t-mohin/t-mohin/blob/main/Resume_Parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Python and system packages used for text extraction.
!pip install pymupdf
!pip install textract
# -y keeps the install non-interactive so a "Run all" does not stop at apt's
# confirmation prompt (same flag as the antiword install further down).
!apt-get install -y unrtf
!pip install --upgrade pdfminer.six

In [None]:
# NOTE(review): pymupdf is already installed by the first setup cell; this repeat looks redundant.
!pip install pymupdf


In [None]:
# PDF / Word parsing libraries.
# NOTE(review): textract is also installed by the first setup cell, and both
# `pdfminer` (here) and `pdfminer.six` (first cell) are requested - confirm that
# both are really intended.
!pip install PyPDF2
!pip install pdfplumber pdfminer docx2txt textract


In [None]:
# Mount Google Drive at /content/drive; main() reads the resumes from, and writes
# the CSV to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# NOTE(review): presumably required by textract to read legacy .doc files - confirm.
!apt-get install -y antiword

In [None]:
import os
import re
import pdfplumber
import docx2txt
import textract
import glob
import csv
from concurrent.futures import ProcessPoolExecutor, as_completed

# Precompiled regex patterns used by extract_info().

# E-mail address: local part, "@", a domain label, then a dotted suffix.
email_pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')

# Phone number in one of three shapes: 10 digits with optional separators,
# a "(NNN)" area code followed by 7 digits, or a bare 7-digit number.
# The single outer group makes findall() return each number as one string.
phone_pattern = re.compile(r'(\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})')

# Person name: two or more Capitalized words, either at the start of a line or
# after the word "name" (any case, optionally followed by ":" / whitespace).
#   * Only the "name" keyword is case-insensitive; a global (?i) would make
#     [A-Z][a-z]+ match ordinary lowercase words as well.
#   * re.MULTILINE lets ^ anchor at every line start, not just the first one.
#   * \b around "name" avoids matching inside words such as "username".
#   * Words are separated by spaces/tabs only, so a name never spans a newline.
name_pattern = re.compile(
    r'(?:\b(?i:name)\b|^)[\s:]*([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+)',
    re.MULTILINE,
)

# US state names, matched case-insensitively as whole words.
location_pattern = re.compile(r'\b(?:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming)\b', re.IGNORECASE)

# Multi-word / ordinary job titles, matched case-insensitively.
_JOB_TITLE_PHRASES = (
    'Software Engineer', 'Data Scientist', 'Project Manager', 'Web Developer',
    'Product Manager', 'Business Analyst', 'Graphic Designer', 'Consultant',
    'Sales Manager', 'Marketing Manager', 'Quality Assurance',
    'Senior Software Developer', 'DevOps Engineer', 'Systems Administrator',
    'Network Engineer', 'Database Administrator', 'Security Analyst',
    'Data Analyst', 'Machine Learning Engineer', 'Cloud Engineer',
    'Full Stack Developer', 'Frontend Developer', 'Backend Developer',
    'Mobile Developer', 'UI/UX Designer', 'Scrum Master',
    'Technical Support Specialist', 'IT Manager', 'Technical Lead',
    'Engineering Manager', 'Non-Technical Role',
    'Customer Service Representative', 'Retail Sales Associate',
    'Administrative Assistant', 'Human Resources Specialist', 'Accountant',
    'Financial Analyst', 'Executive Assistant',
)

# Acronym titles stay case-sensitive: with IGNORECASE, "IT" also matches the
# pronoun "it", which appears in almost every resume.
_JOB_TITLE_ACRONYMS = ('IT', 'CTO', 'CIO')

# Phrases are tried before the bare acronyms so "IT Manager" is not cut to "IT".
job_title_pattern = re.compile(
    r'\b(?:(?i:' + '|'.join(re.escape(title) for title in _JOB_TITLE_PHRASES) + r')|'
    + '|'.join(_JOB_TITLE_ACRONYMS) + r')\b'
)

def extract_text(file_path):
    """Return the plain text of a resume file.

    The format is chosen from the file extension (case-insensitive):
    ``.pdf`` via pdfplumber, ``.docx`` via docx2txt and ``.doc`` via textract.
    Any other extension yields an empty string.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text, or ``''`` when the extension is unsupported or
        extraction fails (the error is printed, not raised).
    """
    ext = os.path.splitext(file_path)[-1].lower()
    text = ''
    try:
        if ext == '.pdf':
            with pdfplumber.open(file_path) as pdf:
                # A page without extractable text may yield None; treat it as
                # empty instead of letting join() raise and discard the whole
                # document. Pages are joined with a newline so the last line of
                # one page is not glued to the first line of the next.
                text = '\n'.join(page.extract_text() or '' for page in pdf.pages)
        elif ext == '.docx':
            text = docx2txt.process(file_path)
        elif ext == '.doc':
            text = textract.process(file_path).decode('utf-8')
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"Error extracting text from {file_path}: {e}")
    return text

def extract_info(text, file_name):
    """Pull contact details and a job title out of resume text.

    Args:
        text: Full text of the resume.
        file_name: Fallback used as the name when no usable name is found.

    Returns:
        A dict with the keys ``'Name'``, ``'Email'``, ``'Phone'``,
        ``'Location'`` and ``'Job Title'``. Fields with no match are ``''``,
        except ``'Job Title'`` which defaults to ``'Non-Technical Role'``.
        ``'Location'`` is the first five location matches joined by spaces.
    """
    emails = email_pattern.findall(text)
    phones = phone_pattern.findall(text)
    names = name_pattern.findall(text)
    locations = location_pattern.findall(text)
    job_titles = job_title_pattern.findall(text)

    # Headings that look like names to the regex but are not; fall back to the
    # file name for those, and when nothing matched at all.
    if not names or names[0].strip().lower() in ('resume of', 'resume updated', 'professional summary'):
        name = file_name
    else:
        name = names[0].strip()

    # If the detected name starts with a job title, scan the lines for the
    # first one that looks like a name and does not start with a job title.
    if job_title_pattern.match(name):
        for line in text.split('\n'):
            line = line.strip()
            candidate = name_pattern.match(line)
            if candidate and not job_title_pattern.match(line):
                # Keep only the captured name, as the findall() path above
                # does, rather than the whole line ("Name: Jane Doe", or a
                # name followed by other text on the same line).
                name = candidate.group(1).strip()
                break

    email = emails[0] if emails else ''
    phone = phones[0] if phones else ''
    location = ' '.join(locations[:5]) if locations else ''
    job_title = job_titles[0] if job_titles else 'Non-Technical Role'

    return {
        'Name': name,
        'Email': email,
        'Phone': phone,
        'Location': location,
        'Job Title': job_title
    }

def process_file(file_path):
    """Extract the resume fields from a single file.

    Args:
        file_path: Path of the resume file.

    Returns:
        The dict produced by extract_info(), or ``None`` when no text could be
        extracted from the file.
    """
    text = extract_text(file_path)
    if not text:
        return None

    # Fallback name: first whitespace-separated token of the file stem, with a
    # 'Copy of ' prefix removed.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    tokens = stem.replace('Copy of ', '').split()
    # A stem that is empty once the prefix is removed (e.g. "Copy of .pdf")
    # has no tokens; indexing [0] would raise IndexError, so use the stem.
    file_name = tokens[0] if tokens else stem.strip()
    return extract_info(text, file_name)

def main(resumes_dir='/content/drive/MyDrive/Resumes',
         output_csv='/content/drive/MyDrive/Extracted_Resume_Info.csv'):
    """Parse every file in a directory and write the results to a CSV file.

    Args:
        resumes_dir: Directory whose entries are passed to process_file().
        output_csv: Path of the CSV file to (over)write.

    The CSV has the columns Name, Email, Phone, Location and Job Title, with
    one row per file for which process_file() returned a result. Files are
    processed in parallel, but rows are written in sorted path order so the
    output is the same from run to run.
    """
    files = sorted(glob.glob(os.path.join(resumes_dir, '*')))

    extracted_info = []
    if not files:
        # Nothing to parse; the CSV is still written, with only its header row.
        print(f"No files found in {resumes_dir}")
    else:
        # Using ProcessPoolExecutor for parallel processing
        with ProcessPoolExecutor() as executor:
            futures = {executor.submit(process_file, path): path for path in files}
            # Collect in submission order (dicts preserve insertion order);
            # result() blocks only until that particular file is done.
            for future, path in futures.items():
                try:
                    result = future.result()
                except Exception as e:
                    # One bad file must not discard every other result.
                    print(f"Error processing {path}: {e}")
                    continue
                if result:
                    extracted_info.append(result)

    # Writing results to CSV (explicit UTF-8 so non-ASCII names are preserved
    # regardless of the platform's default encoding).
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['Name', 'Email', 'Phone', 'Location', 'Job Title'])
        writer.writeheader()
        writer.writerows(extracted_info)

    print(f"Extraction complete. Information saved to {output_csv}")

# Run the extraction when this cell/script is executed directly; the guard keeps
# main() from running if the code is imported as a module instead.
if __name__ == "__main__":
    main()


In [None]:
from google.colab import files

# Path of the CSV file written by main(); keep it in sync with main()'s output_csv.
output_csv = '/content/drive/MyDrive/Extracted_Resume_Info.csv'

# Download the CSV file through Colab's files helper.
files.download(output_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>