# Project Name: Student Success Chatbot

## Project Synopsis

Student Success Advisors (SSAs) are currently overwhelmed by low-complexity, high-volume inquiries. A reduction in SSA staffing highlights the urgent need for a scalable solution. This project proposes the design and implementation of a chatbot-based self-service tool to handle frequently asked questions (FAQs), integrated with a fallback mechanism ("off-ramp") to escalate complex queries to human advisors.

## Team member names

1. Yu Chen Chou 
2. Zhimin Xiong 
3. Haysam Elamin

## Data Source
* kb1.csv: Content from the College website.
* kb2.csv: Content from the Student Success Portal website.
* kb3.csv: Content from Student_Fees_FQA_Winter_2024.pdf and RO_FQA_Winter_2024.pdf.
* student_queries.csv: synthetic dataset to train the intent classifier model.
* GoEmotions dataset from Hugging Face 🤗 Datasets library.

## Setup

To run the app, follow the steps below:
* Create a virtual environment using Python 3.10 or 3.9.6, and install the dependencies listed in requirements.txt
* Create a .env file with OPENAI_API_KEY
* Models for emotion classifier and intent classifier are too big to push to GitHub.
    * run notebooks\emotionModelTrainer.ipynb to train and save the emotion classifier model
    * run notebooks\intentModelTrainer.ipynb to train and save the intent classifier model

## Streamlit UI

In [None]:
import streamlit as st
import streamlit.components.v1 as components
from datetime import datetime
import streamlit as st
from datetime import datetime

from src.chatbotController import ChatbotController

# Page config: browser-tab title and icon for the chat page.
st.set_page_config(page_title="Conestoga Student Support Chatbot", page_icon="🤖")

# Hide Streamlit footer
st.markdown("<style>footer {visibility: hidden;}</style>", unsafe_allow_html=True)

# Custom CSS: styles elements carrying the "chat-input" class (a thin separator
# line and some padding above them). The class is used by the <div> emitted
# around the text box further down in this script.
st.markdown("""
    <style>
        .chat-input {
            border-top: 1px solid #eee !important;
            padding-top: 10px;
        }
    </style>
""", unsafe_allow_html=True)

st.title("📚 Student Support Chatbot")

# Initialize chatbot once per session: the controller is stored in
# st.session_state so it is not rebuilt on every rerun of this script.
if "chatbot" not in st.session_state:
    st.session_state.chatbot = ChatbotController()
    # Issue one throwaway query right after construction.
    # NOTE(review): presumably a warm-up so the first real question is answered
    # faster; it runs the complete get_answer() path — confirm that the extra
    # call (and whatever it logs / requests) is intended.
    st.session_state.chatbot.get_answer("hello")

# Initialize history with the advisor's greeting. Each entry is a dict with
# "sender", "avatar", "text", "time" and, for advisor messages, "role".
if "history" not in st.session_state:
    st.session_state.history = [{
                "sender": "Lulu",
                "role": "Student Success Advisor",
                "avatar": "💁‍♀️",
                "text": "Hello, my name is Lulu, how can I help you today?",
                "time": datetime.now().strftime("%H:%M")
            }]

# If input was previously stored, clear it before rendering the input box.
# The "clear_input" flag is set by the input handler below after a message is
# submitted; the reset happens here, before the text_input widget keyed
# "chat_input" is created later in this run, and the flag is consumed.
if "clear_input" in st.session_state:
    st.session_state.chat_input = ""
    del st.session_state["clear_input"]

# --- Build the chat history HTML ---
# Local import: only this rendering step needs the HTML escaper.
from html import escape as _escape_html

# One HTML fragment per message; joined once after the loop instead of growing
# a string with += on every iteration.
chat_fragments = []

for msg in st.session_state.history:
    sender = msg.get("sender")
    # Text typed by the student is untrusted input: escape it so it is shown
    # literally instead of being interpreted as markup or script. Other
    # messages are inserted unchanged because the advisor's replies may
    # deliberately contain HTML (for example a clickable booking link).
    text = _escape_html(msg["text"]) if sender == "You" else msg["text"]

    if sender == "System":
        # Centered, low-key notice line.
        chat_fragments.append(
            f"<div style='text-align:center; color: gray; font-size: 12px; margin: 10px 0;'>{text}</div>"
        )
    else:
        # The student's own messages are right-aligned, everything else left.
        align = "left" if msg["sender"] != "You" else "right"
        direction = "row" if align == "left" else "row-reverse"
        bubble_color = "#daf4fa"
        avatar = msg.get("avatar", "👤")
        chat_fragments.append(
            f"<div style='display:flex; flex-direction:{direction}; margin-bottom:10px;'>"
            f"<div style='font-size:24px; margin:0 10px;'>{avatar}</div>"
            f"<div><div style='font-weight:bold; font-size:13px;color:lightgray;'>{msg['sender']}</div>"
            f"<div style='font-size:11px; color:gray;'>{msg.get('role', '')}</div>"
            f"<div style='background-color:{bubble_color}; color:black; padding:10px; border-radius:10px; max-width:600px; margin-top:4px;'>{text}</div>"
            "</div></div>"
        )

chat_body = "".join(chat_fragments)

# Scrollable chat box followed by a small script that scrolls it to the bottom
# so the newest message is visible.
full_html = (
    "<div id='chat-box' style='height: 400px; overflow-y: auto; border: 1px solid #ccc; padding-right: 10px; border-radius: 10px;'>"
    f"{chat_body}"
    "</div>"
    "<script>var chatBox = document.getElementById('chat-box'); if (chatBox) { chatBox.scrollTop = chatBox.scrollHeight; }</script>"
)

# Render using components.html (not st.markdown)
components.html(full_html, height=400)

# Show "Advisor is typing..." message if a response is pending.
# "pending_input" is only present on the run in which the reply is computed
# (it is stored by the submit branch below and popped by the reply branch).
if "pending_input" in st.session_state:
    st.markdown(
        "<div style='color: gray; font-style: italic; padding: 5px 0;'>💁‍♀️ Lulu is typing ...</div>",
        unsafe_allow_html=True
    )

# --- Chat input ---
# Opening tag of the styled wrapper; the matching closing tag is emitted at the
# end of this section.
# NOTE(review): each st.markdown call may be rendered as its own element, in
# which case this <div> does not actually enclose the text box — confirm.
st.markdown("<div class='chat-input'>", unsafe_allow_html=True)
user_input = st.text_input("Type a message here and press Enter...", label_visibility="collapsed", key="chat_input")

# Phase 2 (run following a submission): generate the advisor's reply for the
# message stored on the previous run, append it to the history, and rerun so
# the history rendered earlier in the script includes it.
if "pending_input" in st.session_state:
    user_input = st.session_state.pop("pending_input")  # Remove to avoid reprocessing
    reply = st.session_state.chatbot.get_answer(user_input)

    st.session_state.history.append({
        "sender": "Lulu",
        "role": "Student Success Advisor",
        "avatar": "💁‍♀️",
        "text": reply,
        "time": datetime.now().strftime("%H:%M")
    })
    st.rerun()

# Phase 1 (the student just submitted text): record the student's message,
# remember it for phase 2, flag the text box for clearing, and rerun so the new
# message and the typing indicator are displayed before the reply is computed.
if user_input:
    st.session_state.history.append({
        "sender": "You",
        "avatar": "🧑‍🎓",
        "text": user_input,
        "time": datetime.now().strftime("%H:%M")
    })

    # Store input to handle bot reply on next run
    st.session_state["pending_input"] = user_input
    st.session_state["clear_input"] = True  
    st.rerun()

st.markdown("</div>", unsafe_allow_html=True)  # Close chat-input

## Chatbot Controller

In [None]:
from src.handlers.answerGenerator import AnswerGenerator
from src.handlers.emotionClassifier import EmotionClassifier
from src.handlers.intentClassifier import IntentClassifier
from src.handlers.interactionLogger import InteractionLogger
from src.handlers.searchEngine import FaissSearchEngine
import concurrent.futures

class ChatbotController:
    """
    ChatbotController orchestrates the main components of the student-facing chatbot system.

    It integrates intent classification, emotion detection, vector-based knowledge retrieval,
    answer generation, and interaction logging. The controller processes incoming student queries
    by identifying their intent and emotional state in parallel. If distress is detected, it
    responds with a referral to a human advisor. Otherwise, it retrieves relevant context from the
    knowledge base and generates a user-friendly answer using a language model.

    Class attributes:
        DISTRESS_EMOTIONS: emotion labels that trigger escalation to a human advisor.
        ESCALATION_MESSAGE: HTML reply returned when a query is escalated.
        DEFAULT_STUDENT_ID: student identifier written to the log when the caller supplies none.
    """

    # NOTE(review): the EmotionClassifier shown in this project lists the labels
    # "sadness", "grief", "fear", "remorse", "disappointment", "nervousness" and
    # "embarrassment"; of these only "sadness" and "fear" appear below, while
    # "anger" and "disgust" are not in that list — confirm the intended set.
    DISTRESS_EMOTIONS = ("anger", "sadness", "fear", "disgust")

    ESCALATION_MESSAGE = "I'm really sorry you're feeling this way. You don’t have to go through it alone. Please speak with a Student Success Advisor who can support you. You can book an appointment at <a href='https://collegeportal.edu/ssa-booking'>https://collegeportal.edu/ssa-booking</a> or call us directly at 555-123-4567."

    DEFAULT_STUDENT_ID = 'student_123'

    def __init__(self):
        self.intentClassifier = IntentClassifier()
        self.emotionClassifier = EmotionClassifier()
        self.answer_generator = AnswerGenerator()
        self.vector_search = FaissSearchEngine()
        self.logger = InteractionLogger()

    def get_knowledge_base(self, query, top_k=10, max_chars=10000):
        """
        Retrieve knowledge-base text relevant to ``query`` as one context string.

        Args:
            query: the student's question.
            top_k: number of chunks requested from the search engine.
            max_chars: maximum length of the returned context; longer text is cut off.

        Returns:
            The contents of the retrieved chunks separated by blank lines,
            truncated to ``max_chars`` characters.
        """
        kbResults = self.vector_search.search(query, top_k=top_k)
        # Combine top-k chunks into a single context string
        context = "\n\n".join(str(chunk['content']) for chunk, _ in kbResults)
        # Slicing is a no-op when the context is already short enough.
        return context[:max_chars]

    def get_answer(self, query, student_id=None):
        """
        Produce the chatbot's reply to ``query`` and log the interaction.

        Args:
            query: the student's message.
            student_id: identifier stored with the log entry; defaults to
                ``DEFAULT_STUDENT_ID`` so existing callers keep their behaviour.

        Returns:
            ``ESCALATION_MESSAGE`` when the detected emotion is one of
            ``DISTRESS_EMOTIONS`` (logged with an empty answer); otherwise the
            generated answer.
        """
        if student_id is None:
            student_id = self.DEFAULT_STUDENT_ID

        # Run both classifiers concurrently; result() re-raises any exception
        # raised inside a worker.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            emotionPrediction = executor.submit(self.emotionClassifier.predict, query)
            intentPrediction = executor.submit(self.intentClassifier.predict, query)

            emotion = emotionPrediction.result()
            intent = intentPrediction.result()

        # escalate to human agent
        if emotion is not None and emotion in self.DISTRESS_EMOTIONS:
            self.logger.log(student_id, query, intent, emotion, "")
            return self.ESCALATION_MESSAGE

        # get context from knowledge base
        context = self.get_knowledge_base(query)
        # Generate answer
        answer = self.answer_generator.generate_answer_with_openai(context, query)
        self.logger.log(student_id, query, intent, emotion, answer)
        return answer

## Intent Classifier

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

class IntentClassifier:
    """
    Predicts which kind of request a student's message expresses.

    A BERT sequence-classification model and its tokenizer are loaded from
    ``model_path``; the model's highest-scoring output index is translated to
    a human-readable name through ``self.labels``.

    Methods:
        predict(text):
            Returns the intent label with the highest score for ``text``.
    """
    def __init__(self, model_path="models/intentClassifier"):
        tokenizer = BertTokenizer.from_pretrained(model_path)
        model = BertForSequenceClassification.from_pretrained(model_path)
        model.eval()  # inference mode

        self.tokenizer = tokenizer
        self.model = model
        # Position in this list corresponds to the class index produced by the model.
        self.labels = [
            "Course Information",
            "Enrollment / Course Registration",
            "Withdrawal or Drop Course",
            "Access Issues (portal/login)",
            "Technical Support",
            "Tuition/Fees Inquiry",
            "Scholarship/Financial Aid",
            "Mental Health Concerns",
            "Stress or Burnout",
            "Bullying or Harassment",
            "Administrative Support",
            "Campus Facilities",
            "Housing/Accommodation",
            "Extracurricular Activities",
            "General Complaint",
        ]

    def predict(self, text):
        """Return the label whose logit is largest for ``text``."""
        encoded = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Gradients are not needed for inference.
        with torch.no_grad():
            logits = self.model(**encoded).logits
            best_index = int(torch.argmax(logits))
        return self.labels[best_index]


## Emotion Classifier

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F

class EmotionClassifier:
    """
    Classifies emotional tone in a given text using a fine-tuned BERT model.

    This class loads a pre-trained emotion classification model (fine-tuned on a subset of
    negative emotions) and predicts the most probable emotion label from a fixed list,
    if the confidence exceeds a specified threshold.

    Methods:
        predict(text, threshold=0.85):
            Returns the most probable emotion label if the model's confidence reaches the
            threshold; otherwise, returns None.
    """
    def __init__(self, model_path="models/emotionClassifier"):
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.model.eval()
        # Position in this list corresponds to the class index produced by the model.
        self.labels: list = ["sadness", "grief", "fear", "remorse", "disappointment", "nervousness", "embarrassment"]

    def predict(self, text, threshold=0.85):
        """
        Return the most probable emotion for a single ``text``, or None.

        Args:
            text: the message to classify (one string).
            threshold: minimum softmax probability the top label must reach.

        Returns:
            The label with the highest probability when that probability is
            ``>= threshold``; otherwise None.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
            # Take the first (only) row explicitly; unlike squeeze() this keeps
            # a 1-D vector even when the model has a single output class.
            probs = F.softmax(outputs.logits, dim=1)[0].cpu().numpy()

        # Pick the highest-probability label instead of the first one above the
        # threshold, so thresholds <= 0.5 cannot return a less likely emotion
        # merely because it comes earlier in self.labels.
        best_label, best_prob = max(
            zip(self.labels, probs),
            key=lambda pair: pair[1],
            default=(None, 0.0),
        )
        if best_label is not None and best_prob >= threshold:
            return best_label
        return None

## Search Engine

In [None]:
import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

class FaissSearchEngine:
    """
    Provides semantic search functionality over a pre-built FAISS index of text chunks.

    This class loads a FAISS index and its associated metadata, and uses a sentence embedding
    model (MiniLM) to perform efficient vector-based similarity search. Duplicate content is
    filtered to ensure diverse results.

    Methods:
        search(query: str, top_k: int = 5) -> List[Tuple[Dict, float]]:
            Returns top-k semantically relevant results for a given query, excluding duplicates.
    """
    def __init__(self, index_path='models/faiss.index', meta_path='models/texts.pkl'):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.texts = None
        self._load_index(index_path, meta_path)

    def _load_index(self, index_path, meta_path):
        """Load the FAISS index and the pickled list of chunk records."""
        self.index = faiss.read_index(index_path)
        # SECURITY NOTE: unpickling executes arbitrary code embedded in the
        # file. Only load metadata files produced by this project's own index
        # builder; never point meta_path at untrusted data.
        with open(meta_path, 'rb') as f:
            self.texts = pickle.load(f)

    def search(self, query, top_k=5):
        """
        Return up to ``top_k`` distinct chunks most similar to ``query``.

        Args:
            query: text to search for.
            top_k: maximum number of results.

        Returns:
            A list of ``(record, distance)`` tuples in the order returned by the
            index; ``record`` is the stored metadata entry for the chunk and
            ``distance`` the value reported by the index for it. Chunks whose
            'content' repeats an earlier result are skipped.
        """
        query_embedding = self.model.encode([query])
        query_embedding = np.array(query_embedding).astype('float32')
        # Ask for twice as many neighbours so that duplicates can be dropped
        # and top_k distinct results still remain.
        D, I = self.index.search(query_embedding, top_k * 2)
        seen = set()
        results = []
        for distance, idx in zip(D[0], I[0]):
            # FAISS fills unused result slots with index -1 when the index
            # holds fewer vectors than requested; indexing self.texts with -1
            # would wrongly return the last chunk, so skip those slots.
            if idx < 0:
                continue
            item = self.texts[idx]
            content = item['content']
            if content not in seen:
                seen.add(content)
                results.append((item, distance))
            if len(results) == top_k:
                break
        return results


## Answer Generator

In [None]:
from openai import OpenAI
from dotenv import load_dotenv
import os

class AnswerGenerator:
    """
    Generates natural language answers to student queries using large language models (LLMs).

    This class supports integration with OpenAI (e.g., GPT-4) to provide student support responses
    based on retrieved knowledge base context.

    The answers are generated in the role of a student success advisor, with a consistent prompt
    guiding the model to use the provided context and to fall back on its own knowledge when the
    context does not contain the answer.

    Methods:
        generate_answer_with_openai(context, question, model):
            Uses OpenAI's chat completions API to generate a response based on the given context and question.
    """

    # System prompt sent with every request (spelling of "advisor" corrected
    # and a stray trailing quote character removed).
    SYSTEM_PROMPT = (
        "You are a helpful student success advisor. Use the provided context to answer the "
        "student's question. If the answer is not in the context, answer it with the best of "
        "your knowledge."
    )

    def __init__(self):
        # Read OPENAI_API_KEY from the environment, loading a .env file first if present.
        load_dotenv()
        api_key = os.getenv("OPENAI_API_KEY")
        self.client = OpenAI(api_key=api_key)

    def generate_answer_with_openai(self, context: str, question: str, model: str = "gpt-4") -> str:
        """
        Ask the chat model to answer ``question`` using ``context``.

        Args:
            context: knowledge-base text to ground the answer.
            question: the student's question.
            model: name of the chat model to use.

        Returns:
            The model's reply with surrounding whitespace removed, or an empty
            string when the response carries no text content.
        """
        response = self.client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"Context:\n{context}\n\nQuestion:\n{question}",
                },
            ],
            temperature=0.3,
        )

        # The message content can be None (e.g. a refusal or an empty
        # completion); calling .strip() on it would raise AttributeError.
        content = response.choices[0].message.content
        return (content or "").strip()


## Interaction Logger

In [None]:
import csv
from datetime import datetime
import os

class InteractionLogger:
    """
    Logs chatbot interactions to a CSV file for record-keeping and analysis.

    This class handles the creation and maintenance of a log file that records
    student interactions with the chatbot, including timestamp, student ID,
    question, predicted intent, detected emotion, and the answer given.

    Methods:
        log(student, question, intent, emotion, answer):
            Appends a new interaction entry to the log file with the current timestamp.
    """

    # Column names written as the first row of a new log file.
    HEADER = ['datetime', 'student', 'question', 'intent', 'emotion', 'answer']

    def __init__(self, log_file='logs/log.csv'):
        """
        Prepare ``log_file`` for appending.

        The parent directory is created when missing, and the header row is
        written when the file does not exist yet or is empty.
        """
        self.log_file = log_file
        # Without this, opening the file fails with FileNotFoundError on a
        # fresh checkout where the log directory has not been created.
        log_dir = os.path.dirname(self.log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        # Create the file with headers if it doesn't exist (or has no content)
        if not os.path.exists(self.log_file) or os.path.getsize(self.log_file) == 0:
            with open(self.log_file, mode='w', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(self.HEADER)

    def log(self, student, question, intent, emotion, answer):
        """
        Append one interaction to the log.

        Args:
            student: identifier of the student.
            question: the student's message.
            intent: predicted intent label.
            emotion: detected emotion label, or None (written as an empty field).
            answer: the reply given to the student.
        """
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(self.log_file, mode='a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([now, student, question, intent, emotion, answer])

## Dashboard

In [None]:
# Import libraries
import pandas as pd
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt

# Emotion labels counted as "distressed" on this dashboard.
distressed_emotions = ["anger", "sadness", "fear", "disgust"]
# Load log CSV file
df = pd.read_csv('logs/log.csv', parse_dates=['datetime'])
# Rows with a missing emotion are treated as not distressed (isin -> False).
df["is_distressed"] = df['emotion'].isin(distressed_emotions)
st.title("📊 Chatbot Dashboard")

# Intent type distribution map
st.header("✨ FAQ Intent Type Distribution")
intent_counts = df['intent'].value_counts()
# Sort values
intent_counts = intent_counts.sort_values(ascending=False)
# Create figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=intent_counts.values, y=intent_counts.index, ax=ax, palette='Blues_d')
# Labels and title
ax.set_xlabel('Count')
ax.set_ylabel('Intent Type')
ax.set_title('Intent Type Distribution')
# Layout and display
plt.tight_layout()
st.pyplot(fig)

# Percentage of troublesome messages
st.header("⚠️ Percentage of Disturbing Messages")
distress_counts = df['is_distressed'].value_counts().rename({True: 'Distressed', False: 'Not Distressed'})

# value_counts() orders categories by frequency, so the bar order changes with
# the data. Choose each bar's colour from its label rather than its position;
# otherwise "Not Distressed" is drawn red whenever it is the larger group.
bar_colors = ['red' if label == 'Distressed' else 'green' for label in distress_counts.index]

fig2, ax2 = plt.subplots(figsize=(10, 6))
ax2.bar(distress_counts.index, distress_counts.values, color=bar_colors)
ax2.set_ylabel('Count')
ax2.set_title('Distress Message Proportion')
st.pyplot(fig2)

# Heat map of troublesome messages: one row per calendar day, one column per
# distress category, cell value = number of logged messages.
st.header("📆 Daily Distress Information Heatmap")
df['date'] = df['datetime'].dt.date
pivot = df.pivot_table(index='date', columns='is_distressed', values='student', aggfunc='count', fill_value=0)
pivot.rename(columns={False: 'Not Distressed', True: 'Distressed'}, inplace=True)

fig3, ax3 = plt.subplots(figsize=(12, 8))
sns.heatmap(pivot, annot=True, fmt="d", cmap='YlOrRd', ax=ax3)
ax3.set_title('Daily Distress Messages Heatmap')
st.pyplot(fig3)

# Latest trouble information list, newest first
st.header("🚨 Latest Troubling Messages")
distressed = df[df['is_distressed']]
st.dataframe(distressed[['datetime', 'student', 'question', 'intent']].sort_values('datetime', ascending=False))

## Vector Index Builder

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import os
import pickle
from glob import glob

class VectorIndexBuilder:
    """
    Builds and saves a FAISS vector index from CSV files containing text chunks.

    This class uses the SentenceTransformer model ('all-MiniLM-L6-v2') to generate
    dense vector embeddings for textual content extracted from CSV files in data folder.
    Each CSV is expected to contain 'url', 'chunk_number', and 'content' columns and has name like kb*.csv.

    The resulting embeddings are indexed using FAISS (IndexFlatL2), enabling efficient
    vector-based semantic search. Both the FAISS index and associated metadata (text and source info)
    are saved to disk for later retrieval and use.
    """

    # Columns every knowledge-base CSV must provide.
    REQUIRED_COLUMNS = frozenset({'url', 'chunk_number', 'content'})

    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.texts = []

    def build_index_from_folder(self, folder_path):
        """
        Embed the chunks of every ``kb*.csv`` file in ``folder_path`` and index them.

        Files lacking any of the required columns are skipped. On success
        ``self.texts`` holds one metadata record per chunk, in the same order
        as the vectors added to ``self.index``.

        Raises:
            ValueError: if no chunk could be collected (no matching file, or no
                file with the required columns).
        """
        # Sort so the order of vectors/metadata does not depend on the
        # directory listing order.
        csv_files = sorted(glob(os.path.join(folder_path, "kb*.csv")))
        all_texts = []
        all_records = []

        for path in csv_files:
            df = pd.read_csv(path)
            if self.REQUIRED_COLUMNS.issubset(df.columns):
                df = df.fillna('')  # Ensure no NaNs
                all_texts.extend(df['content'].astype(str).tolist())
                all_records.extend(df[['url', 'chunk_number', 'content']].to_dict(orient='records'))

        # Fail with a clear message instead of an IndexError on the embedding
        # shape when there is nothing to index.
        if not all_texts:
            raise ValueError(
                f"No text chunks found in {folder_path!r}: expected kb*.csv files "
                "with 'url', 'chunk_number' and 'content' columns."
            )

        self.texts = all_records  # Store metadata for each chunk
        embeddings = self.model.encode(all_texts, show_progress_bar=True)
        embeddings = np.array(embeddings).astype('float32')  # FAISS needs float32
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)

    def save_index(self, index_path='models/faiss.index', meta_path='models/texts.pkl'):
        """Write the FAISS index to ``index_path`` and the chunk metadata (pickled) to ``meta_path``."""
        faiss.write_index(self.index, index_path)
        with open(meta_path, 'wb') as f:
            pickle.dump(self.texts, f)

# Build the knowledge-base index: embed every kb*.csv under "data" and write
# the index and its metadata to the builder's default output paths
# (models/faiss.index and models/texts.pkl).
builder = VectorIndexBuilder()
builder.build_index_from_folder("data")
builder.save_index()
print("✅ Index and metadata saved.")

## 