# Azure OpenAI GPT-3.5 Turbo CV Data Extractor
Extract structured CV review data (GPT-3.5 Turbo, with GPT-4o as a fallback) and export it to Excel.

In [1]:
import os

from dotenv import load_dotenv
# Run before any os.getenv lookup: later cells read the AZURE_OPENAI_* settings from the
# environment, and this call is what makes values from a local .env file available there.
load_dotenv()

True

In [2]:
import os

# Root folder that is searched recursively for review files.
data_folder = "data"

# Sorted paths of every "chatgpt_result.md" found anywhere under data_folder.
chatgpt_results = sorted(
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(data_folder)
    for filename in filenames
    if filename == "chatgpt_result.md"
)


In [3]:
from langchain_core.utils.function_calling import convert_to_openai_function
from typing import List, Optional
from langchain.pydantic_v1 import BaseModel, Field


# Schema for one extracted CV review record. In this notebook it is used three ways:
# converted to an OpenAI function definition (last line of this cell), passed to
# PydanticOutputFunctionsParser as the output type, and used to re-load cached JSON
# through parse_raw.
# NOTE(review): the class docstring and the Field descriptions presumably end up in the
# function definition sent to the model - treat them as prompt text rather than ordinary
# documentation, and confirm before rewording them.
class StudentCvRecord(BaseModel):
    """Call this to save a student CV record in markdown format."""
    # Overwritten after extraction with the name of the folder that holds the CV.
    name: str = Field(description="Name of the student")
    # email and mobile_number are the only fields declared Optional.
    email: Optional[str] = Field(description="Email address")
    mobile_number: Optional[str] = Field(description="Contact number")
    linkedin_profile_url: str = Field(description="LinkedIn profile url")
    # The 1-10 range is stated only in the description; nothing here validates it.
    resume_rating: int = Field(
        description="Rating of the resume between 1 to 10")
    rationale: str = Field(description="Rationale for the rating")
    warning: str = Field(description="Any warning message")
    feedback: str = Field(description="Feedback message")
    # List-valued fields; each becomes a single column in the exported spreadsheet.
    proposed_job_titles: List[str] = Field(description="Proposed job titles")
    certifications: List[str] = Field(description="List of certifications")
    technologies: List[str] = Field(description="List of technologies")
    skills: List[str] = Field(description="List of skills")
    work_experience: List[str] = Field(description="List of work experiences")


# Function definition derived from the model; the next cell binds it to both LLMs with a
# forced function_call so every response targets "StudentCvRecord".
student_cv_record_function = convert_to_openai_function(StudentCvRecord)

In [4]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers.openai_functions import PydanticOutputFunctionsParser

def _force_cv_record_call(llm):
    """Bind ``llm`` so that it is forced to answer with a ``StudentCvRecord`` function call."""
    return llm.bind(
        functions=[student_cv_record_function],
        function_call={"name": "StudentCvRecord"},
    )


# Prompt and parser are shared by both chains.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a data extraction assistant, if you unsure the value, set the value to blank!",
        ),
        ("user", "{cv}\nSave a student CV record."),
    ]
)
parser = PydanticOutputFunctionsParser(pydantic_schema=StudentCvRecord)

# Chain the extraction loop tries first; deployment settings come from the environment.
llm35 = AzureChatOpenAI(
    temperature=0,
    azure_deployment=os.getenv("AZURE_OPENAI_GPT35_DEPLOYMENT_NAME"),
    openai_api_version=os.getenv("AZURE_OPENAI_GPT35_API_VERSION"),
)
model_with_forced_function35 = _force_cv_record_call(llm35)
chain35 = prompt | model_with_forced_function35 | parser

# Chain the extraction loop falls back to when chain35 raises.
llm4o = AzureChatOpenAI(
    temperature=0,
    azure_deployment=os.getenv("AZURE_OPENAI_GPT4O_DEPLOYMENT_NAME"),
    openai_api_version=os.getenv("AZURE_OPENAI_GPT4O_API_VERSION"),
)
model_with_forced_function4o = _force_cv_record_call(llm4o)
chain4o = prompt | model_with_forced_function4o | parser


In [5]:
import json
from tqdm import tqdm

def _cache_path_for(markdown_path):
    """Return the JSON cache path that sits next to ``markdown_path``.

    Only the final extension is swapped, so a ".md" that happens to occur in a
    folder name is left untouched (``str.replace`` would rewrite it too).
    """
    return os.path.splitext(markdown_path)[0] + ".json"


def _student_name_from_path(markdown_path):
    """Return the name of the folder that directly contains ``markdown_path``.

    Uses ``os.path`` instead of splitting on "/" so that the backslash-separated
    paths ``os.path.join`` produces on Windows are handled as well.
    """
    return os.path.basename(os.path.dirname(markdown_path))


# One StudentCvRecord per discovered review file, in the order of chatgpt_results.
student_records = []

for result_path in tqdm(chatgpt_results):
    result_path_json = _cache_path_for(result_path)

    # A cached record next to the markdown file means this CV was already extracted;
    # reuse it so re-running the cell does not call the model again.
    if os.path.exists(result_path_json):
        with open(result_path_json, "r", encoding="utf-8") as f:
            result = StudentCvRecord.parse_raw(f.read())
        student_records.append(result)
        continue

    # Explicit encoding so reading does not depend on the platform default.
    with open(result_path, "r", encoding="utf-8") as f:
        cv = f.read()

    try:
        result = chain35.invoke({"cv": cv})
    except Exception as e:
        # Still best-effort: fall back to the second chain, but report why the first
        # one failed instead of discarding the error. tqdm.write keeps the bar intact.
        tqdm.write(f"chain35 failed for {result_path} ({e!r}); retrying with chain4o")
        result = chain4o.invoke({"cv": cv})

    # The folder name, not the model's guess, is used as the student's name.
    result.name = _student_name_from_path(result_path)

    with open(result_path_json, "w", encoding="utf-8") as f:
        f.write(json.dumps(result.dict()))
    student_records.append(result)



100%|██████████| 82/82 [00:00<00:00, 10075.37it/s]


In [6]:
import pandas as pd

# One row per student record, indexed by student name, written out as a spreadsheet.
rows = [record.dict() for record in student_records]
df = pd.DataFrame(rows).set_index('name')
df.to_excel('data/resumes_reviews.xlsx', index=True)
