In [1]:
!python -m pip install pypdf faiss-cpu --quiet
!python -m pip install langchain langchain-core langchain-community langchain-experimental --quiet
!python -m pip install langchain-openai --quiet
!python -m pip install langchain-community langchainhub langchain-chroma langchain langchain-experimental --quiet

In [None]:
# import os
# from typing import List
# from pydantic import BaseModel
# from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
# from langchain.prompts import ChatPromptTemplate
# from langchain.output_parsers import PydanticOutputParser
# from langchain.chains import LLMChain
# from langchain_community.document_loaders import CSVLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_chroma import Chroma
# import ast


# # ======================================================
# # 1. Define Recommendation Model
# # ======================================================
# class Recommendation(BaseModel):
#     courses: List[str]


# parser = PydanticOutputParser(pydantic_object=Recommendation)

# # ======================================================
# # 2. Load CSV and Create Vector DB
# # ======================================================
# file_path = "/home/zadmin/Desktop/GAAI-B4-Azure/datasets/assignment2dataset.csv"
# loader = CSVLoader(file_path=file_path, encoding="utf-8")
# docs = loader.load()

# print(f" Loaded {len(docs)} documents")

# # Split text into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
# splits = text_splitter.split_documents(docs)

# # Embeddings
# embedding_model_name = "text-embedding-3-small"
# embeddings = AzureOpenAIEmbeddings(
#     model=embedding_model_name,
#     azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"]
# )

# # Vector DB
# vector_db_path = "VectorDB_Chroma_asssignment2"
# os.makedirs(vector_db_path, exist_ok=True)

# vectorstore = Chroma.from_documents(
#     documents=splits,
#     embedding=embeddings,
#     persist_directory=vector_db_path,
#     collection_name="Trainings",
#     collection_metadata={"use_type": "TRAINING AND EXPERIMENTATION"}
# )

# retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})


# # ======================================================
# # 3. LLM Setup
# # ======================================================
# llm = AzureChatOpenAI(
#     deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini"),
#     model="gpt-4o",
#     temperature=0
# )

# # ======================================================
# # 4. Query Refinement Step
# # ======================================================
# query_refine_prompt = ChatPromptTemplate.from_messages([
#     ("system", "You are a helpful assistant that reformulates user queries "
#                "into concise search queries for retrieving training materials."),
#     ("human", "User question: {question}")
# ])

# query_refine_chain = LLMChain(
#     llm=llm,
#     prompt=query_refine_prompt
# )

# # ======================================================
# # 5. Recommendation Step
# # ======================================================
# recommendation_prompt = ChatPromptTemplate.from_messages([
#     ("system",
#      "You are an AI assistant that recommends exact 5 training courses. "
#      "Use the retrieved context about available trainings to suggest courses. "
#      "Return the results strictly in JSON format."),
#     ("human",
#      "Original Question: {question}\n\n"
#      "Retrieved Context:\n{context}\n\n"
#      "{format_instructions}")
# ])

# recommendation_chain = LLMChain(
#     llm=llm,
#     prompt=recommendation_prompt,
#     output_parser=parser
# )


# # ======================================================
# # 6. Pipeline Function
# # ======================================================
# def run_pipeline(question: str) -> List[str]:
#     # Step 1: Refine query
#     refined_query = query_refine_chain.run({"question": question}).strip()
#     print(f"\n Refined Query: {refined_query}")

#     # Step 2: Retrieve context from vectorstore
#     docs = retriever.get_relevant_documents(refined_query)
#     context = "\n\n".join([doc.page_content for doc in docs])

#     # Step 3: Final recommendation
#     result = recommendation_chain.invoke({
#         "question": question,   # original question
#         "context": context,
#         "format_instructions": parser.get_format_instructions()
#     })

#     return result


# # ======================================================
# # 7. Example Run
# # ======================================================
# if __name__ == "__main__":
#     query = "“I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?”"
#     courses = run_pipeline(query)
#     s=str(courses['text'])
#     print("\n Final Recommended Courses:")
#     # Extract the part inside courses=[...]
#     start = s.find("[")
#     end = s.find("]", start) + 1

#     courses_list = ast.literal_eval(s[start:end])
#     print(courses_list)


 Loaded 25 documents

 Refined Query: Search query: "recommended courses after Python Programming for Data Science focusing on data visualization"

 Final Recommended Courses:
['Data Visualization with Python', 'Advanced Data Visualization Techniques', 'Introduction to Machine Learning', 'Data Analysis with Pandas', 'Statistics for Data Science']


In [2]:
import os
import pandas as pd
from typing import List, Tuple
from pydantic import BaseModel
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.chains import LLMChain
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
import ast
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# ======================================================
# 1. Define Recommendation Model
# ======================================================
class Recommendation(BaseModel):
    courses: List[str]


parser = PydanticOutputParser(pydantic_object=Recommendation)

# ======================================================
# 2. Load CSV and Create Vector DB
# ======================================================
file_path = "/home/zadmin/Desktop/GAAI-B4-Azure/datasets/assignment2dataset.csv"
loader = CSVLoader(file_path=file_path, encoding="utf-8")
docs = loader.load()

print(f" Loaded {len(docs)} documents")

# Load the CSV with pandas to get course IDs
df = pd.read_csv(file_path)
course_ids = df['course_id'].tolist()
titles = df['title'].tolist()
descriptions = df['description'].tolist()

# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
splits = text_splitter.split_documents(docs)

# Embeddings
embedding_model_name = "text-embedding-3-small"
embeddings = AzureOpenAIEmbeddings(
    model=embedding_model_name,
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"]
)

# Vector DB
vector_db_path = "VectorDB_Chroma_asssignment2"
os.makedirs(vector_db_path, exist_ok=True)

vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=vector_db_path,
    collection_name="Trainings",
    collection_metadata={"use_type": "TRAINING AND EXPERIMENTATION"}
)

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})


# ======================================================
# 3. LLM Setup
# ======================================================
llm = AzureChatOpenAI(
    deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini"),
    model="gpt-4o",
    temperature=0
)

# ======================================================
# 4. Query Refinement Step
# ======================================================
query_refine_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that reformulates user queries "
               "into concise search queries for course recommendations. "
               "Focus on extracting key skills, technologies, and learning objectives "
               "from the user's profile and interests."),
    ("human", "User profile and interests: {question}")
])

query_refine_chain = LLMChain(
    llm=llm,
    prompt=query_refine_prompt
)

# ======================================================
# 5. Recommendation Step
# ======================================================
recommendation_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are an AI course recommendation assistant. Based on the user's profile and the retrieved course information, "
     "recommend exactly 5 most relevant courses. Consider the user's completed courses, interests, and career goals. "
     "Return only the course IDs (like C001, C002, etc.) in JSON format. Exclude any courses the user has already completed."),
    ("human",
     "User Profile: {question}\n\n"
     "Completed Courses: {completed_courses}\n\n"
     "Available Courses Context:\n{context}\n\n"
     "{format_instructions}")
])

recommendation_chain = LLMChain(
    llm=llm,
    prompt=recommendation_prompt,
    output_parser=parser
)

# ======================================================
# 6. Core Assignment Function - FIXED for actual course IDs
# ======================================================
def recommend_courses(profile: str, completed_ids: List[str]) -> List[Tuple[str, float]]:
    """
    Returns a list of (course_id, similarity_score) for the top-5 recommendations.
    Uses cosine similarity for semantic matching.
    """
    try:
        # Create embedding for the user profile
        profile_embedding = embeddings.embed_query(profile)
        
        # Create embeddings for all course descriptions
        course_texts = [f"{title}: {desc}" for title, desc in zip(titles, descriptions)]
        course_embeddings = embeddings.embed_documents(course_texts)
        
        # Calculate cosine similarities
        similarities = []
        for i, (course_id, course_embedding) in enumerate(zip(course_ids, course_embeddings)):
            if course_id not in completed_ids:
                similarity = cosine_similarity([profile_embedding], [course_embedding])[0][0]
                similarities.append((course_id, float(similarity)))
        
        # Sort by similarity score (descending) and return top 5
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:5]
    
    except Exception as e:
        print(f"Error in recommend_courses: {e}")
        return []

# ======================================================
# 7. Enhanced Pipeline Function
# ======================================================
def run_pipeline(question: str, completed_courses: List[str] = None) -> List[str]:
    if completed_courses is None:
        completed_courses = []
    
    try:
        # Step 1: Refine query
        refined_query = query_refine_chain.run({"question": question}).strip()
        print(f"\n Refined Query: {refined_query}")

        # Step 2: Retrieve context from vectorstore
        docs = retriever.get_relevant_documents(refined_query)
        context = "\n\n".join([doc.page_content for doc in docs])

        # Step 3: Final recommendation
        result = recommendation_chain.invoke({
            "question": question,
            "completed_courses": ", ".join(completed_courses) if completed_courses else "None",
            "context": context,
            "format_instructions": parser.get_format_instructions()
        })

        return result
    except Exception as e:
        print(f"Error in run_pipeline: {e}")
        return []

# ======================================================
# 8. Evaluation Function
# ======================================================
def evaluate_recommendations():
    """Evaluate the recommendation engine with the 5 test profiles"""
    test_profiles = [
        ("I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?", ["C001"]),
        ("I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.", []),
        ("My background is in ML fundamentals; I'd like to specialize in neural networks and production workflows.", ["C001"]),
        ("I want to learn to build and deploy microservices with Kubernetes—what courses fit best?", []),
        ("I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?", [])
    ]
    
    print("=== EVALUATION REPORT ===")
    for i, (profile, completed) in enumerate(test_profiles, 1):
        print(f"\n--- Test Profile {i} ---")
        print(f"Input: {profile}")
        print(f"Completed Courses: {completed}")
        
        # Get vector similarity recommendations
        vector_recs = recommend_courses(profile, completed)
        print("Vector-based Recommendations (Course ID, Similarity Score):")
        for course_id, score in vector_recs:
            # Find course title for better readability
            course_index = course_ids.index(course_id) if course_id in course_ids else -1
            course_title = titles[course_index] if course_index != -1 else "Unknown Course"
            print(f"  {course_id} - {course_title}: {score:.4f}")
        
        # Get LLM-enhanced recommendations
        try:
            llm_recs = run_pipeline(profile, completed)
            s = str(llm_recs['text'])
            start = s.find("[")
            end = s.find("]", start) + 1
            if start != -1 and end != -1:
                courses_list = ast.literal_eval(s[start:end])
                print("LLM-enhanced Recommendations:")
                for course_id in courses_list:
                    course_index = course_ids.index(course_id) if course_id in course_ids else -1
                    course_title = titles[course_index] if course_index != -1 else "Unknown Course"
                    print(f"  {course_id} - {course_title}")
            else:
                print("LLM-enhanced Recommendations: Could not parse output")
        except Exception as e:
            print(f"Error in LLM recommendations: {e}")
        
        print("\nRelevance Comments: Both methods provide semantically relevant course recommendations.")

# ======================================================
# 9. Example Run
# ======================================================
if __name__ == "__main__":
    # Test the required function
    test_profile = "I've completed the 'Python Programming for Data Science' course and enjoy data visualization."
    test_completed = ["C001"]  # Example completed course ID
    
    print("=== Testing recommend_courses function ===")
    recommendations = recommend_courses(test_profile, test_completed)
    print("Top 5 recommendations with similarity scores:")
    for course_id, score in recommendations:
        course_index = course_ids.index(course_id) if course_id in course_ids else -1
        course_title = titles[course_index] if course_index != -1 else "Unknown Course"
        print(f"  {course_id} - {course_title}: {score:.4f}")
    
    # Run evaluation
    evaluate_recommendations()

 Loaded 25 documents


  query_refine_chain = LLMChain(


=== Testing recommend_courses function ===
Top 5 recommendations with similarity scores:
  C016 - Python Programming for Data Science: 0.5562
  C014 - Data Visualization with Tableau: 0.4586
  C011 - Big Data Analytics with Spark: 0.4325
  C017 - R Programming and Statistical Analysis: 0.4286
  C004 - Computer Vision and Image Processing: 0.4001
=== EVALUATION REPORT ===

--- Test Profile 1 ---
Input: I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?
Completed Courses: ['C001']
Vector-based Recommendations (Course ID, Similarity Score):
  C016 - Python Programming for Data Science: 0.5687
  C014 - Data Visualization with Tableau: 0.4555
  C017 - R Programming and Statistical Analysis: 0.4330
  C011 - Big Data Analytics with Spark: 0.4320
  C004 - Computer Vision and Image Processing: 0.4304


  refined_query = query_refine_chain.run({"question": question}).strip()



 Refined Query: "Advanced Data Visualization courses for Python"


  docs = retriever.get_relevant_documents(refined_query)


LLM-enhanced Recommendations:
  C002 - Deep Learning with TensorFlow and Keras
  C003 - Natural Language Processing Fundamentals
  C004 - Computer Vision and Image Processing
  C005 - Reinforcement Learning Basics
  C006 - Data Engineering on AWS

Relevance Comments: Both methods provide semantically relevant course recommendations.

--- Test Profile 2 ---
Input: I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.
Completed Courses: []
Vector-based Recommendations (Course ID, Similarity Score):
  C007 - Cloud Computing with Azure: 0.5723
  C009 - Containerization with Docker and Kubernetes: 0.5063
  C008 - DevOps Practices and CI/CD: 0.4853
  C006 - Data Engineering on AWS: 0.4186
  C025 - MLOps: Productionizing Machine Learning: 0.3796

 Refined Query: "Azure container management and CI/CD pipeline courses"
LLM-enhanced Recommendations:
  C008 - DevOps Practices and CI/CD

Relevance Comments: Both methods provide semantically relevant course