In [30]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
import pickle

Example notation for API

In [31]:
'''
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key= os.getenv('OPENAI_API_KEY')
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
)
'''

'\nimport os\nfrom openai import OpenAI\n\nclient = OpenAI(\n    # This is the default and can be omitted\n    api_key= os.getenv(\'OPENAI_API_KEY\')\n)\n\nchat_completion = client.chat.completions.create(\n    messages=[\n        {\n            "role": "user",\n            "content": "Say this is a test",\n        }\n    ],\n    model="gpt-3.5-turbo",\n)\n'

In [32]:
from dotenv import load_dotenv
import os
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the environment variables from the .env file
load_dotenv()

# Access your API key from environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')

# Initialize the OpenAI client with your API key
client = OpenAI(api_key=openai_api_key)

Old Load

Pre-processing text function to use on documents, in order to generate more accurate embeddings for the documents

In [33]:
# Load spaCy's English NLP model
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Parse the document using spaCy
    doc = nlp(text.lower())  # Convert text to lower case

    # Remove punctuation and stop words, and apply lemmatization
    clean_tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]

    # Join the tokens back into a single string
    clean_text = ' '.join(clean_tokens)
    
    return clean_text


Loading Context Files from Tulane Website

In [34]:
def load_and_preprocess_documents(directory):
    documents = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            with open(filepath, 'r', encoding='utf-8') as file:
                text = file.read()
                processed_text = preprocess_text(text)
                documents.append(processed_text)
    return documents

# Load and preprocess all documents
documents = load_and_preprocess_documents('RAG_DATA')




KeyboardInterrupt: 

If TFIDF Doesn't work, bert embeddings could be good

In [35]:

def save_documents(documents, filepath):
    with open(filepath, 'wb') as file:
        pickle.dump(documents, file)

def load_documents(filepath):
    with open(filepath, 'rb') as file:
        return pickle.load(file)

# Load and preprocess all documents

save_documents_path = 'processed_documents.pkl'

# Save the processed documents to a file
save_documents(documents, save_documents_path)

In [36]:
documents

['biochemistry apply bioinformatic ms overview requirement year thesis require program study lead master science degree biochemistry apply bioinformatic year student acquire academic foundation biochemistry bioinformatic year specialize sub field befit research employment interest program design improve academic credential scientific research experience graduate distinctive program emphasize student development area coursework laboratory skill bioinformatic analysis independent thought presentation skill personal growth allow student broaden strengthen academic foundation equip student basic advanced lab bioinformatics skill career academic industrial research student graduate biochemistry cell biology biostatistic bioinformatic course strong emphasis research application biochemical molecular bioinformatic knowledge \xa0  bioinformatic training focus skill application diverse tool database address genomic gene expression proteomic metabolism protein structure function drug binding \xa

Old retreive

In [37]:
# Function to retrieve most similar documents based on a query
def retrieve(query, vectorizer, tfidf_matrix, data, top_k=3):
    # Validate inputs
    if not data or top_k <= 0:
        return []
    
    try:
        # Transform the query to the same vector space as the documents
        query_tf = vectorizer.transform([query])
        
        # Calculate cosine similarities between the query and all documents
        similarities = cosine_similarity(query_tf, tfidf_matrix).flatten()
        
        # Determine the number of documents to return
        if top_k is None or top_k > len(data):
            top_k = len(data)  # If top_k is None or more than the available docs, return all documents
        
        # Get the indices of the documents with the highest similarities
        top_indices = similarities.argsort()[-top_k:][::-1]
        
        # Retrieve the top_k most similar documents and their similarity scores
        return [(data[i], similarities[i]) for i in top_indices]
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return []


Retreival Function 

In [38]:
def retrieve(query, vectorizer, tfidf_matrix, data, top_k=3):
    # Validate inputs
    if not data or top_k <= 0:
        return []

    try:
        # Transform the query to the same vector space as the documents
        query_tf = vectorizer.transform([query])
        
        # Calculate cosine similarities between the query and all documents
        similarities = cosine_similarity(query_tf, tfidf_matrix).flatten()

        # Tokenize the query into keywords
        query_keywords = set(query.lower().split())

        # Prepare a list to store matches and their combined scores
        matches = []

        # Iterate over each document entry
        for i, document in enumerate(data):
            # Extract title from the document assuming it's the first sentence before the comma
            title = document.split(',')[0].lower()
            title_keywords = set(title.split())

            # Calculate the number of query keywords that appear in the title
            common_keywords = query_keywords.intersection(title_keywords)
            keyword_count = len(common_keywords)

            # Calculate a combined score
            # Here, you might want to balance the importance of cosine similarity and keyword count
            # For example, you could give a weight to keyword matches to adjust their influence
            combined_score = similarities[i] + (keyword_count * 0.5)  # Adjust the weight (0.1) as needed

            # Store the document along with its combined score
            matches.append((document, combined_score))

        # Sort by the combined scores in descending order
        matches.sort(key=lambda x: x[1], reverse=True)

        # Return the top_k most relevant documents based on the combined scores
        return matches[:top_k]

    except Exception as e:
        print(f"An error occurred: {e}")
        return []

Using Bert Embeddings

Question answering function employeeing gpt api

In [39]:




# Function to answer questions using retrieved texts
def answer_question(question, documents, vectorizer, tfidf_matrix, model, top_k=5, max_tokens=200, stop_sequence=None):
    retrieved_texts = retrieve(question, vectorizer, tfidf_matrix, documents, top_k=top_k)
    context = " ".join([text for text, _ in retrieved_texts])
    

    if context:  # Check if there is any context retrieved
        try:
            # Create a chat completion using the question and context
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "Answer the question based on the context below"},
                    {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
                ],
                temperature=0,
                max_tokens=max_tokens,
                stop=stop_sequence,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return str(e)
    else:
        return "No relevant context found for the question."



Actually Loading Document Data To prepare for use

In [43]:
# Load the documents
#documents = load_and_preprocess_documents('RAG_DATA')

In [40]:
documents = load_documents("processed_documents.pkl")

Using TFidf

In [41]:
# Initialize and fit the TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000, min_df = 2, stop_words = "english")
tfidf_matrix = vectorizer.fit_transform(documents)

Using Loader


Usage

In [42]:
# Example usage of answering a question
query = "What are the required computer science classes for the major"
model_name = "gpt-3.5-turbo"  
answer = answer_question(query, documents, vectorizer, tfidf_matrix, model=model_name)
print("Query:", query)
print(answer)

Query: What are the required computer science classes for the major
The required computer science classes for the major include:
- CMP 1500: Introduction to Computer Science
- CMP 1600: Introduction to Computer Science II
- CMP 2170: Introduction to Discrete Math
- CMP 2200: Introduction to Algorithms
- CMP 2300: Introduction to Computer Systems and Networking

Additionally, students must select two courses at the 3000 level from a list of options, such as:
- CMP 3130/6130: Introduction to Computational Geometry
- CMP 3140/6140: Introduction to Artificial Intelligence
- CMP 3160/6160: Introduction to Data Science
- CMP 3210/6210: Algorithms in Computational Structural Biology
- CMP 3240/6240: Introduction to Machine Learning

For the senior level, students must complete a capstone project consisting of:
- CMP 4010: Capstone Project I
- CMP 4020: Capstone Project II

These courses


Improvements that can be added once it's working

In [16]:
# Initialize tokenizer and model from pre-trained BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # Set the model to evaluation mode

def get_bert_embeddings(documents, tokenizer, model):
    embeddings = []
    for text in documents:
        # Tokenize the text and prepare input tensors
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512, padding='max_length')
        with torch.no_grad():
            # Generate embeddings
            outputs = model(**inputs)
            # Get the embeddings of the [CLS] token
            cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
            embeddings.append(cls_embedding)

    # Convert the list of embeddings into a matrix
    embeddings_matrix = np.stack(embeddings)
    return embeddings_matrix, tokenizer, model

def bert_vectorize(query, tokenizer, model):
    # Tokenize the query and prepare input tensors
    inputs = tokenizer(query, return_tensors='pt', truncation=True, max_length=512, padding='max_length')
    with torch.no_grad():
        # Generate embeddings using BERT
        outputs = model(**inputs)
        # Get the embeddings of the [CLS] token
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return cls_embedding

def retrieve_bert(query, tokenizer, model, embeddings_matrix, data, top_k=3):
    # Validate inputs
    if not data or top_k <= 0:
        return []

    try:
        # Vectorize the query using BERT
        query_vec = bert_vectorize(query, tokenizer, model)

        # Calculate cosine similarities between the query vector and all document embeddings
        similarities = cosine_similarity([query_vec], embeddings_matrix).flatten()

        # Prepare a list to store matches and their combined scores
        matches = []

        # Iterate over each document entry
        for i, document in enumerate(data):
            # Calculate a score based on cosine similarity
            combined_score = similarities[i]

            # Store the document along with its combined score
            matches.append((document, combined_score))

        # Sort by the combined scores in descending order
        matches.sort(key=lambda x: x[1], reverse=True)

        # Return the top_k most relevant documents based on the combined scores
        return matches[:top_k]

    except Exception as e:
        print(f"An error occurred: {e}")
        return []


query = "Acounting Minor"
embeddings_matrix = get_bert_embeddings(documents, tokenizer, model)

top_k_results = retrieve_bert(query, tokenizer, model, embeddings_matrix, documents, top_k=3)

KeyboardInterrupt: 

Continuous chatbot

In [10]:
import uuid

def generate_session_id():
    return str(uuid.uuid4())

In [21]:
def answer_question(session_id, question, documents, vectorizer, tfidf_matrix, model, top_k=3, max_tokens=200, stop_sequence=None):
    # Assuming a global or higher scope session management
    if session_id not in sessions:
        sessions[session_id] = []  # Initializes conversation history

    # Add user's question to session history
    sessions[session_id].append({'role': 'user', 'content': question})

    # Retrieve contextually relevant texts based on the question
    retrieved_texts = retrieve(question, vectorizer, tfidf_matrix, documents, top_k=top_k)
    context = " ".join([text for text, _ in retrieved_texts])

    if context:
        # Prepare messages for the API call including historical context
        messages = sessions[session_id] + [{'role': 'system', 'content': f"Please consider the context: {context}"}]

        try:
            # Create a chat completion using the question and the accumulated context
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,
                max_tokens=max_tokens,
                stop=stop_sequence,
            )

            # Extract the content from the response and add it to the session
            answer = response.choices[0].message.content
            sessions[session_id].append({'role': 'system', 'content': answer})
            return answer
        except Exception as e:
            return f"An error occurred: {e}"
    else:
        no_context_reply = "No relevant context found for the question."
        sessions[session_id].append({'role': 'system', 'content': no_context_reply})
        return no_context_reply

# Example global session storage
sessions = {}

In [22]:
# Example usage of answering a question
query = "What are the required computer science classes for the major"
model_name = "gpt-3.5-turbo"  
id = generate_session_id()
answer = answer_question(id, query, documents, vectorizer, tfidf_matrix, model=model_name)
print("Query:", query)
print(answer)

Query: What are the required computer science classes for the major
The required computer science classes for the major in Digital Medium Practice include:

- DMPC 2001: Introduction to Digital Filmmaking
- DMPC 2002: Narrative Filmmaking
- DMPC 5550: Advanced Digital Filmmaking
- DMPC 5560: Advanced Digital Filmmaking II

These courses provide students with a foundation in digital filmmaking, narrative storytelling, and advanced techniques in digital media production. Students also have the option to select elective courses from a list of approved options in areas such as game studies, experimental game design, podcasting, and more.
