In [51]:
import os
from openai import OpenAI

# Build the client. Passing api_key explicitly is optional here, since reading
# OPENAI_API_KEY from the environment is already the default behaviour.
# NOTE: the variable must be present in the environment when this cell runs.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Smoke test: send one user message and keep the raw response object.
chat_completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say this is a test"}],
)

In [37]:
from dotenv import load_dotenv
import os
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Populate the process environment from the .env file
load_dotenv()

# Read the API key once, then build the client that answer_question() relies on
openai_api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=openai_api_key)

In [62]:


def load_documents(directory):
    """Read every regular file directly inside ``directory`` as UTF-8 text.

    Sub-directories are skipped (there is no recursion). Entries are visited in
    sorted filename order so the returned list, and every row index derived
    from it, is identical on each run; ``os.listdir`` on its own makes no
    ordering guarantee.

    Args:
        directory: Path of the folder that holds the documents.

    Returns:
        list[str]: One string per file, containing that file's full contents.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Only plain files are read; directories and other entries are ignored.
        if os.path.isfile(filepath):
            with open(filepath, 'r', encoding='utf-8') as file:
                documents.append(file.read())
    return documents

In [39]:
def retrieve(query, vectorizer, tfidf_matrix, data, top_k=3):
    """Rank documents by cosine similarity between their TF-IDF rows and ``query``.

    Args:
        query: Query text.
        vectorizer: Fitted vectorizer; only its ``transform`` method is used.
        tfidf_matrix: Document-term matrix compared against the query vector.
            Row ``i`` is assumed to correspond to ``data[i]``.
        data: Sequence of documents, indexed by matrix row.
        top_k: Maximum number of results. Values <= 0 yield an empty list.

    Returns:
        list[tuple]: ``(document, similarity)`` pairs ordered from most to
        least similar, at most ``top_k`` long.
    """
    # Without this guard, top_k == 0 makes the slice below `[-0:]`, i.e. `[0:]`,
    # which returns every document; negative values slice equally wrongly.
    if top_k <= 0:
        return []

    query_tf = vectorizer.transform([query])
    similarities = cosine_similarity(query_tf, tfidf_matrix).flatten()
    # argsort is ascending: take the last top_k indices, then reverse to best-first.
    top_indices = similarities.argsort()[-top_k:][::-1]
    return [(data[i], similarities[i]) for i in top_indices]

In [63]:




def answer_question(question, documents, vectorizer, tfidf_matrix, model, top_k=3, max_tokens=150, stop_sequence=None):
    """Answer ``question`` with a chat model, using retrieved documents as context.

    The ``top_k`` best matches are fetched with ``retrieve``; matches whose
    similarity is zero are dropped, and the remaining texts are joined with
    single spaces and sent together with the question to the module-level
    ``client``.

    Args:
        question: The user's question; also used as the retrieval query.
        documents: Documents to search; passed to ``retrieve`` as ``data``.
        vectorizer: Fitted vectorizer forwarded to ``retrieve``.
        tfidf_matrix: Document-term matrix forwarded to ``retrieve``.
        model: Name of the chat model to call.
        top_k: Number of documents to retrieve.
        max_tokens: Upper bound on the length of the completion.
        stop_sequence: Optional stop sequence(s) forwarded as ``stop``.

    Returns:
        str: The model's reply with surrounding whitespace removed; the fixed
        message "No relevant context found for the question." when nothing
        relevant was retrieved; or the text of the exception if the request
        fails (errors are returned, not raised).
    """
    retrieved_texts = retrieve(question, vectorizer, tfidf_matrix, documents, top_k=top_k)

    # retrieve() always returns top_k documents, even at similarity 0. Keep only
    # real matches so unrelated text is not sent to the model and the
    # "no context" branch below is actually reachable.
    context = " ".join(text for text, score in retrieved_texts if score > 0)

    if not context.strip():
        return "No relevant context found for the question."

    try:
        # Create a chat completion using the question and context
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Answer the question based on the context below"},
                {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
            ],
            temperature=0,
            max_tokens=max_tokens,
            stop=stop_sequence,
        )
        # message.content may be None; treat that as an empty answer rather than
        # letting an AttributeError message be returned as if it were the reply.
        content = response.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        # Kept for backward compatibility: failures are reported as the returned string.
        return str(e)




In [66]:
# Question to ask and the chat model that should answer it
query = "What are the core classes for the computer science major?"
model_name = "gpt-3.5-turbo"

# Read the corpus and build its TF-IDF representation
documents = load_documents('RAG_DATA')
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = vectorizer.fit_transform(documents)

# Retrieve supporting documents, query the model, and show the reply
answer = answer_question(query, documents, vectorizer, tfidf_matrix, model=model_name)
print(answer)

The core classes for the computer science major at Tulane University are:

1. CMPS 1500 - Intro to Computer Science I (4 c.h.)
2. CMPS 1600 - Intro to Computer Science II (4 c.h.)
3. CMPS 2170 - Intro to Discrete Math (3 c.h.)
4. CMPS 2200 - Intro to Algorithms (3 c.h.)
5. CMPS 2300 - Intro to Comp Sys & Networking (3 c.h.)
