In [9]:
from langchain_openai import OpenAI
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA 
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
llm = OpenAI(temperature=0.6, max_tokens=500)

folder_path = 'D:\\Data Science\\Generative_AI\\Resume reader'
loader = DirectoryLoader(folder_path, glob='*.txt', loader_cls=TextLoader)
documents = loader.load()

embeddings = HuggingFaceInstructEmbeddings()

load INSTRUCTOR_Transformer
max_seq_length  512


In [10]:
vectordb = FAISS.from_documents(
    documents=documents,
    embedding=embeddings
)

retriever = vectordb.as_retriever()

In [12]:
prompt_template = """Given the following context and a question, generate an answer using minimum number of words based on this context only.
    In the answer try to provide as much text as possible from "response" section in the source document context without making much changes.
    If the answer is not found in the context, kindly state "I don't know." Don't try to make up an answer.

    CONTEXT: {context}

    QUESTION: {question}"""

PROMPT = PromptTemplate(
        template=prompt_template, 
        input_variables=["context", "question"]
    )

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    input_key="query",
    return_source_documents=False,
    chain_type_kwargs={"prompt": PROMPT})

result = chain("Mention the required academic qualification")
print("Answer: ",result)

Answer:  {'query': 'Mention the required academic qualification', 'result': " for Senior Data Scientist\n\nMaster's Degree in Computer Science, Statistics, Math, Operations Research, Economics, or a related field."}


In [24]:
import pandas as pd
from dotenv import load_dotenv
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS

load_dotenv()

llm = OpenAI(temperature=0.6, max_tokens=500)

folder_path = 'D:\\Data Science\\Generative_AI\\Resume reader'
loader = DirectoryLoader(folder_path, glob='*.txt', loader_cls=TextLoader)
documents = loader.load()

embeddings = HuggingFaceInstructEmbeddings()

prompt_template = """Given the following context and a question, generate an answer using minimum number of words based on this context only.
    In the answer try to provide as much text as possible from "response" section in the source document context without making much changes.
    If the answer is not found in the context, kindly state "I don't know." Don't try to make up an answer.

    CONTEXT: {context}

    QUESTION: {question}"""

PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)

questions = [
    "Mention the job title using less than four words.",
    "Mention the number of experience years needed for positions using less than six words.",
    "Mention the academic qualification using less than ten words."
]


all_results = []

for doc in documents:
    vectordb = FAISS.from_documents(
        documents=[doc],
        embedding=embeddings
    )

    retriever = vectordb.as_retriever()

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        input_key="query",
        return_source_documents=False,
        chain_type_kwargs={"prompt": PROMPT}
    )

    # Dictionary to store the results for this document
    results = {
        "Job title": [],
        "Experienced years": [],
        "Academic Qualification": []
    }

    # Query each question and store the results
    for question in questions:
        result = chain({"query": question})
        if question.startswith("Mention the job title"):
            results["Job title"].append(result['result'])
        elif question.startswith("Mention the number of experience years"):
            results["Experienced years"].append(result['result'])
        elif question.startswith("Mention the academic qualification"):
            results["Academic Qualification"].append(result['result'])

    # Add the results for this document to the list of all results
    all_results.append(results)

# Convert all results to a DataFrame
data = {
    "File": [],
    "Job title": [],
    "Experienced years": [],
    "Academic Qualification": []
}

for i, result in enumerate(all_results):
    data["File"].append(f'JD_{i+1}.txt')
    data["Job title"].append(result["Job title"])
    data["Experienced years"].append(result["Experienced years"])
    data["Academic Qualification"].append(result["Academic Qualification"])

df = pd.DataFrame(data)

# Print the DataFrame
print(df)

print("\nAcademic Qualification:")
for qualification in df["Academic Qualification"]:
    print(qualification)


load INSTRUCTOR_Transformer
max_seq_length  512
       File                             Job title        Experienced years  \
0  JD_1.txt                         [\n\nAnalyst]         [\n\n1-2 years.]   
1  JD_2.txt           [\n\nSenior Data Scientist]  [\n\nAt least 2 years.]   
2  JD_3.txt  [\n\nBusiness Development Associate]      [\n\nI don't know.]   
3  JD_4.txt                  [\n\nData Scientist]  [\n\nAt least 2 years.]   
4  JD_5.txt                 [\n\nProcess Analyst]          [\n\n0-3 years]   

                              Academic Qualification  
0  [\n\nGraduate in science/business/accounting o...  
1  [\n\nMaster's Degree in Computer Science, Stat...  
2         [\nBusiness, Marketing, or related field.]  
3  [\n\nMaster's Degree in Computer Science, Stat...  
4  [\n\nAny Graduate (Arts/science/commerce/busin...  

Academic Qualification:
['\n\nGraduate in science/business/accounting or CIMA/CA.']
["\n\nMaster's Degree in Computer Science, Statistics, Math, or rela

In [2]:
import os
import pandas as pd
from dotenv import load_dotenv
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS

load_dotenv()

llm = OpenAI(temperature=0, max_tokens=500)

folder_path = 'D:\\Data Science\\Generative_AI\\Resume reader'
loader = DirectoryLoader(folder_path, glob='*.pdf', loader_cls=PyMuPDFLoader)
documents = loader.load()

# Print the loaded documents for debugging
print(f"Number of documents loaded: {len(documents)}")
for doc in documents:
    print(f"Document metadata: {doc.metadata}")

embeddings = HuggingFaceInstructEmbeddings()

prompt_template = """
Given the following context and a question, generate an answer using minimum number of words based on this context only.
In the answer try to provide as much text as possible from "response" section in the source document context without making much changes.
If the answer is not found in the context, kindly state "I don't know." Don't try to make up an answer.

CONTEXT: {context}

QUESTION: {question}"""

PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)

questions = [
    "Mention the job title using less than four words.",
    "Mention the number of experience years needed for positions using less than six words.",
    "Mention the academic qualification using less than ten words."
]

all_results = []

# Use a set to keep track of processed file names
processed_files = set()

for doc in documents:
    file_name_with_path = doc.metadata["source"]
    file_name = os.path.basename(file_name_with_path)
    if file_name in processed_files:
        continue  # Skip if already processed
    processed_files.add(file_name)
    
    vectordb = FAISS.from_documents(
        documents=[doc],
        embedding=embeddings
    )

    retriever = vectordb.as_retriever()

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        input_key="query",
        return_source_documents=False,
        chain_type_kwargs={"prompt": PROMPT}
    )

    # Dictionary to store the results for this document
    results = {
        "Job title": [],
        "Experienced years": [],
        "Academic Qualification": []
    }

    # Query each question and store the results
    for question in questions:
        result = chain({"query": question})
        if question.startswith("Mention the job title"):
            results["Job title"].append(result['result'])
        elif question.startswith("Mention the number of experience years"):
            results["Experienced years"].append(result['result'])
        elif question.startswith("Mention the academic qualification"):
            results["Academic Qualification"].append(result['result'])

    # Add the results for this document to the list of all results
    results["File"] = file_name  # Store the file name without the path
    all_results.append(results)

# Convert all results to a DataFrame
data = {
    "File": [],
    "Job title": [],
    "Experienced years": [],
    "Academic Qualification": []
}

for result in all_results:
    data["File"].append(result["File"])
    data["Job title"].append(result["Job title"])
    data["Experienced years"].append(result["Experienced years"])
    data["Academic Qualification"].append(result["Academic Qualification"])

df = pd.DataFrame(data)

print(df)

print("\nAcademic Qualification:")
for qualification in df["Academic Qualification"]:
    print(qualification)


Number of documents loaded: 5
Document metadata: {'source': 'D:\\Data Science\\Generative_AI\\Resume reader\\Anthony Davies_Resume.pdf', 'file_path': 'D:\\Data Science\\Generative_AI\\Resume reader\\Anthony Davies_Resume.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Turbo', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20240302172612+05'30'", 'modDate': "D:20240302172612+05'30'", 'trapped': ''}
Document metadata: {'source': 'D:\\Data Science\\Generative_AI\\Resume reader\\Christoper Morgan_Resume.pdf', 'file_path': 'D:\\Data Science\\Generative_AI\\Resume reader\\Christoper Morgan_Resume.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Turbo', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20240302170323+05'30'", 'modDate': 