# LLM Cover Letter Builder Demo

This notebook demonstrates a Retrieval-Augmented Generation (RAG) system that builds AI-powered cover letters using the DeepSeek Chat API and LangChain.

In [None]:
# Section 1: Setup and Dependencies
import langchain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_deepseek import ChatDeepSeek
from dotenv import load_dotenv
import os
import chromadb

# Load environment variables
# load_dotenv() comes from python-dotenv; it is called here so that a local
# .env file (if present) can supply DEEPSEEK_API_KEY before it is read below.
load_dotenv()
# NOTE(review): api_key is not passed to anything visible in this notebook;
# presumably ChatDeepSeek reads DEEPSEEK_API_KEY from the environment itself.
# This will be None when the variable is not set.
api_key = os.getenv('DEEPSEEK_API_KEY')

In [None]:
# Section 2: Document Processing
# Load the raw text that grounds the generated cover letters.
loader = TextLoader("data/demo_data.txt")
documents = loader.load()

# Split into overlapping chunks so each embedded piece stays small while
# neighbouring chunks still share some context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
docs = text_splitter.split_documents(documents)

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Create vector database.
# The inspection helpers in this notebook (view_stored_documents and
# search_documents) open the on-disk store at "vector_db" and read the
# collection named "documents". Write the chunks to that same location and
# collection; otherwise those helpers only ever see an empty collection.
# NOTE: re-running this cell adds the chunks to the persisted collection
# again; delete the "vector_db" directory first for a clean rebuild.
vector_db = Chroma.from_documents(
    docs,
    embedding_model,
    collection_name="documents",
    persist_directory="vector_db",
)

In [None]:
# Section 3: Vector Database Operations
def view_stored_documents(limit=5):
    """Print chunks held in the persistent "documents" collection.

    Opens the Chroma store located at ``vector_db``, fetches at most
    ``limit`` records from the "documents" collection (creating the
    collection if it does not exist yet) and prints each record's text
    and metadata.

    Args:
        limit: Maximum number of records requested from the collection.
    """
    client = chromadb.PersistentClient(path="vector_db")
    records = client.get_or_create_collection(name="documents").get(limit=limit)

    pairs = zip(records["documents"], records["metadatas"])
    # Number the chunks from 1 for display.
    for position, (text, meta) in enumerate(pairs, start=1):
        print(f"Chunk {position}:")
        print(f"Content: {text}")
        print(f"Metadata: {meta}\n")

def search_documents(query_text, n_results=3):
    """Query the persistent "documents" collection and print the hits.

    Opens the Chroma store located at ``vector_db``, runs a single text
    query against the "documents" collection (creating the collection if it
    does not exist yet) and prints the text and metadata of each result.

    Args:
        query_text: Text to search for.
        n_results: Number of results requested from the collection.
    """
    client = chromadb.PersistentClient(path="vector_db")
    store = client.get_or_create_collection(name="documents")
    hits = store.query(query_texts=[query_text], n_results=n_results)

    # Only one query text was submitted, so only the first result list is read.
    matches = zip(hits["documents"][0], hits["metadatas"][0])
    for rank, (text, meta) in enumerate(matches, start=1):
        print(f"Result {rank}:")
        print(f"Content: {text}")
        print(f"Metadata: {meta}\n")

In [None]:
# Section 4: Keyword Extraction
def extract_keywords_llm(text):
    """Extract keywords from a job description using the LLM.

    Sends the job description to the module-level ``llm`` with an
    instruction to reply with comma-separated keywords, then splits the
    reply on commas.

    Args:
        text: The job description to analyse.

    Returns:
        A list of keyword strings with surrounding whitespace removed.
        Empty entries (from an empty reply, a trailing comma, or doubled
        commas) are dropped, so the list may be empty.
    """
    prompt = "Extract keywords related to job description that would help in matching a potential resume to it. Return just a list of comma separated keywords"
    response = llm.invoke(f"{prompt}\n\nJob Description:\n{text}")
    stripped = (piece.strip() for piece in response.content.split(','))
    # "".split(',') yields [''], so filter out blanks instead of returning them.
    return [keyword for keyword in stripped if keyword]

def extract_keywords_spacy(text):
    """Extract keywords using spaCy noun chunks and named entities.

    Args:
        text: The text to analyse.

    Returns:
        A list of unique phrases: noun-chunk texts first, then entity texts,
        each in the order the pipeline yields them.
    """
    # Imported here because the notebook's setup cell does not import spaCy;
    # this also keeps the dependency optional for users who only run the
    # LLM-based extractor.
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Collect noun chunks and named entities.
    phrases = [chunk.text for chunk in doc.noun_chunks]
    phrases.extend(ent.text for ent in doc.ents)

    # dict.fromkeys de-duplicates while keeping first-seen order, so repeated
    # runs return the same ordering (a plain set does not guarantee that).
    return list(dict.fromkeys(phrases))

def extract_keywords_yake(text, top_k=20):
    """Extract keywords using YAKE.

    Args:
        text: The text to analyse.
        top_k: Passed to the extractor as ``top`` (number of keywords
            requested).

    Returns:
        A list holding the first element of every item the extractor
        returns (presumably the keyword of a (keyword, score) pair).
    """
    # Imported here because the notebook's setup cell does not import YAKE;
    # this also keeps the dependency optional.
    from yake import KeywordExtractor

    extractor = KeywordExtractor(lan="en", n=3, top=top_k)
    return [item[0] for item in extractor.extract_keywords(text)]

def extract_keywords_tfidf(text, top_k=20):
    """Extract keywords using a TF-IDF vectorizer fitted on ``text`` alone.

    Args:
        text: The text to analyse.
        top_k: Passed to the vectorizer as ``max_features`` (upper bound on
            the number of terms kept).

    Returns:
        A list of the retained vocabulary terms, in the order reported by
        ``get_feature_names_out()``. A plain list is returned (not a NumPy
        array) so the result can be concatenated with other keyword lists
        exactly like the output of the other extractors.
    """
    # Imported here because the notebook's setup cell does not import
    # scikit-learn; this also keeps the dependency optional.
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(stop_words="english", max_features=top_k)
    # Only the fitted vocabulary is needed; the transformed matrix is not.
    vectorizer.fit([text])
    return vectorizer.get_feature_names_out().tolist()

def extract_keywords_bert(text, top_n=20):
    """Extract keywords using KeyBERT.

    Args:
        text: The text to analyse.
        top_n: Number of keyphrases requested from the model.

    Returns:
        A list holding the first element of every item the model returns
        (presumably the phrase of a (phrase, score) pair). Phrases span one
        to three words and English stop words are excluded.
    """
    # Imported here because the notebook's setup cell does not import
    # KeyBERT; this also keeps the dependency optional.
    from keybert import KeyBERT

    kw_model = KeyBERT()
    scored = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        top_n=top_n,
    )
    return [item[0] for item in scored]

In [None]:
# Section 5: LLM Integration
def generate_response(query, context):
    """Ask the module-level ``llm`` a question grounded in ``context``.

    Args:
        query: The question to answer.
        context: Text the model is told to base its answer on.

    Returns:
        The ``content`` of the model's reply.
    """
    answer = llm.invoke(f"Answer using this context:\n{context}\n\nQuestion: {query}")
    return answer.content

# Initialize LLM
# This single client serves both keyword extraction and cover-letter
# generation in this notebook. A 150-token completion cap cuts a cover letter
# off part-way through, so the budget is raised, and the request timeout is
# raised with it to leave room for the longer completion.
# NOTE(review): 1024 tokens / 60 s are chosen as comfortable headroom for a
# one-page letter; tune them to your account limits if needed.
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0.7,
    max_tokens=1024,
    timeout=60,
    max_retries=2
)

In [None]:
# Section 6: Example Usage
def generate_cover_letter(job_description, resume_keywords):
    """Write a cover letter for ``job_description`` using retrieved context.

    Keywords are extracted from the job description with
    ``extract_keywords_llm``, joined with ``resume_keywords`` into one search
    string, and used to fetch the three most similar chunks from the
    module-level ``vector_db``. Those chunks are given to the module-level
    ``llm`` as the candidate's experience.

    Args:
        job_description: Text of the job posting.
        resume_keywords: List of keywords describing the candidate; it is
            concatenated with the extracted keyword list.

    Returns:
        The ``content`` of the model's reply.
    """
    # Build one retrieval query from both keyword sources.
    posting_keywords = extract_keywords_llm(job_description)
    search_query = " ".join(posting_keywords + resume_keywords)

    # Pull the closest chunks and stitch their text together.
    matches = vector_db.similarity_search(search_query, k=3)
    context = "\n".join(match.page_content for match in matches)

    prompt = f"""Based on the following job description and candidate's experience, generate a compelling cover letter:

Job Description:
{job_description}

Candidate's Experience:
{context}

Please write a professional cover letter that highlights the candidate's relevant experience and skills."""

    return llm.invoke(prompt).content

# Example usage
# Sample posting used to drive the end-to-end demo below.
job_description = """We are looking for a Machine Learning Engineer with experience in Python, TensorFlow, 
and cloud computing (AWS/GCP). The ideal candidate should have a strong background in deep learning and 
natural language processing (NLP)."""

# Candidate-side keywords; generate_cover_letter joins these with the
# keywords it extracts from the posting to form the retrieval query.
resume_keywords = ["machine learning", "NLP", "Python", "TensorFlow", "AWS"]
# Runs the full pipeline (keyword extraction, retrieval, generation) and
# prints the resulting letter.
cover_letter = generate_cover_letter(job_description, resume_keywords)
print(cover_letter)