In [1]:
import os
import csv
import re
import io

from groq import Groq

def extract_text_from_docx(docx_path):
    """
    Extracts text from a .docx file.

    Reads the text of every entry in the document's ``paragraphs`` list and
    joins them with newline characters.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        A single string with one line per paragraph.

    Raises:
        ValueError: If the ``docx`` package cannot be imported or the file
            cannot be opened/parsed. The underlying exception is attached
            as ``__cause__``.
    """
    try:
        # Imported inside the try so that a missing ``docx`` package is
        # reported through the same ValueError as any other failure.
        import docx
        doc = docx.Document(docx_path)
        return '\n'.join(para.text for para in doc.paragraphs)
    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise ValueError(f"Failed to extract text from {docx_path}: {e}") from e

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a .pdf file.

    Opens the file in binary mode, runs ``extract_text()`` on every page and
    joins the per-page results with newline characters.

    Args:
        pdf_path: Path of the .pdf file to read.

    Returns:
        A single string containing the text of all pages, one page after
        another, separated by newlines.

    Raises:
        ValueError: If PyPDF2 cannot be imported or the file cannot be
            opened/parsed. The underlying exception is attached as
            ``__cause__``.
    """
    try:
        # Imported inside the try so that a missing PyPDF2 is reported
        # through the same ValueError as any other failure.
        from PyPDF2 import PdfReader
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            # Defensive: treat a falsy result (e.g. None) as an empty page
            # so join() cannot fail on a non-string and discard the
            # text of every other page.
            pdf_text = [page.extract_text() or '' for page in reader.pages]
        return '\n'.join(pdf_text)
    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise ValueError(f"Failed to extract text from {pdf_path}: {e}") from e

def extract_text_from_resume(file_path):
    """
    Extracts text from a resume file (.docx or .pdf).

    The format is chosen from the file name's extension, compared
    case-insensitively so that names such as ``Resume.PDF`` or ``CV.Docx``
    are handled like their lower-case equivalents.

    Args:
        file_path: Path of the resume file, as a string.

    Returns:
        The text returned by the matching extractor.

    Raises:
        ValueError: If the extension is neither .docx nor .pdf, or if the
            selected extractor fails.
    """
    # Lower-case only the copy used for matching; the original path is what
    # gets opened and what appears in error messages.
    lowered = file_path.lower()
    if lowered.endswith('.docx'):
        return extract_text_from_docx(file_path)
    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    raise ValueError(f"Unsupported file format: {file_path}")

def extract_text_from_resumes_in_folder(folder_path):
    """
    Collects the text of every .docx / .pdf resume found directly in a folder.

    Entries whose name does not end in '.docx' or '.pdf' are ignored. A file
    that fails to parse is reported on stdout and left out of the result;
    it does not stop the remaining files from being processed.

    Returns a dict mapping each file name to its extracted text.
    """
    supported_suffixes = ('.docx', '.pdf')
    extracted = {}
    for entry in os.listdir(folder_path):
        if not entry.endswith(supported_suffixes):
            continue
        full_path = os.path.join(folder_path, entry)
        try:
            extracted[entry] = extract_text_from_resume(full_path)
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Failed to extract text from {entry}: {e}")
    return extracted

# Example usage: send every resume found in `folder_path` to the LLM, parse the
# CSV it returns, and merge all rows into a single CSV file.
folder_path = 'data'
resumes_text = extract_text_from_resumes_in_folder(folder_path)
all_rows = []

# Columns of the combined CSV, in output order. Rows parsed from the model are
# mapped onto exactly these names before being stored.
fieldnames = ["Full Name", "Email ID", "Github Portfolio", "LinkedIn ID", "Employment Details", "Technical Skills", "Soft Skills"]

# One client is shared by all requests instead of being rebuilt per resume.
# The key is read from the GROQ_API_KEY environment variable so that no
# credential lives in the source file. The client is only built when there is
# at least one resume to send, so an empty folder still yields a header-only CSV.
client = Groq(api_key=os.environ.get("GROQ_API_KEY")) if resumes_text else None

for filename, text in resumes_text.items():
    print(f"--- {filename} ---")
    # Define a template for the prompt
    prompt_template = f'''
        You are an AI bot designed to act as a professional for parsing resumes.
        You are given with resume and your job is to extract the following information from the resume just that dont give additional text in the begining and end just this info:
        1. full name
        2. email id
        3. github portfolio
        4. linkedin id
        5. employment details
        6. technical skills
        7. soft skills
        Give the extracted information in csv format only
        and this is resume{text} and dont add additional text in begining and end just extract csv and give complete information i dont want such line also
        Here is the extracted information in CSV format:
        '''

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": prompt_template,
            }
        ],
        temperature=0.4,
        model="llama3-70b-8192",
    )

    response_content = chat_completion.choices[0].message.content
    print(response_content)

    # Drop Markdown code-fence lines the model may wrap around the CSV; left
    # in place, a leading fence would be read as the header row. Splitting on
    # '\n' only (not splitlines) keeps other control characters inside fields.
    csv_lines = [
        line for line in (response_content or '').strip().split('\n')
        if not line.strip().startswith('```')
    ]
    # Use io.StringIO to treat the string as a file-like object
    data_io = io.StringIO('\n'.join(csv_lines).strip())

    # Read the data using csv.DictReader
    reader = csv.DictReader(data_io, delimiter=',')
    rows = []
    for raw_row in reader:
        # Index the parsed values by lower-cased header. Non-string keys or
        # values (DictReader uses None for surplus/missing fields) are dropped.
        by_header = {
            key.strip().lower(): value
            for key, value in raw_row.items()
            if isinstance(key, str) and isinstance(value, str)
        }
        # Keep only the expected columns so DictWriter never sees an unknown
        # key, which would raise ValueError and lose every collected row.
        rows.append({name: by_header.get(name.lower(), '') for name in fieldnames})

    # Extend the list of all rows with current rows
    all_rows.extend(rows)

# Specify the path where you want to save the combined CSV file
csv_file = 'combined_employee_data.csv'

# Write all accumulated data to the CSV file using csv.DictWriter
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(all_rows)

print(f"Combined CSV file '{csv_file}' has been created successfully.")
    

--- Charlotte Donald.docx ---
"Full Name","Email ID","Github Portfolio","LinkedIn ID","Employment Details","Technical Skills","Soft Skills"
"Charlotte Donald","CDonald@uk.ey.com","","","","Audit, IFRS 17, SOX, PCAOB, Financial Reporting, Insurance, Financial Services, Risk Management, Project Management, Client Money Audit, CASS 5, Financial Analysis, Data Analytics","Communication, Leadership, Team Management, Time Management, Problem Solving, Client Relationship Management, Presentation, Event Organisation"
--- Umang Purwar RESUME-de.pdf ---
"Full Name","Email ID","Github Portfolio","LinkedIn ID","Employment Details","Technical Skills","Soft Skills"
"Umang Purwar","umangpurwar03@gmail.com","","https://www.linkedin.com/","JR. DATA SCIENTIST | INNODATATICS | HYDERABAD, INDIA | DEC /2023 - PRESENT, DATA  SCIENCE INTERN | INNODATATICS | HYDERABAD, INDIA | APRIL /2023 - DEC /2023","MLops, Artificial Intelligence, Machine Learning,  ETL, Computer Vision, Web scrapping, NLP, Large Language 