In [7]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
# Fetch the NLTK 'stopwords' and 'wordnet' packages, which back the stop-word
# list and the lemmatizer used by preprocess(). As the captured log shows,
# nltk reports a package as "already up-to-date" when it is present locally.
# The second call is the cell's last expression, so its return value is what
# the notebook displays as the cell result.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\namis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\namis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Example FAQ knowledge base: one record per (question, answer) pair.
faq_data = [
    dict(Question=question, Answer=answer)
    for question, answer in (
        ("How much is the admission fee?", "Admission fee is 5000."),
        ("How can I apply for a hostel?", "Fill the hostel form online at hostel.university.edu."),
        ("When will exams start?", "Exams will begin in December as per the academic calendar."),
    )
]
# Two string columns, "Question" and "Answer", one row per FAQ entry.
df = pd.DataFrame(faq_data)

In [10]:
# Text Preprocessing Function
# These are identical for every call, so build them once instead of re-reading
# the stop-word corpus and re-creating the lemmatizer for each row/query.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess(text):
    """Normalize a piece of text for TF-IDF matching.

    Steps, in order: lowercase, delete every character in
    ``string.punctuation``, split on whitespace, drop English stop words,
    and lemmatize each remaining token with WordNet.

    Args:
        text: The raw string to normalize.

    Returns:
        The surviving lemmatized tokens joined by single spaces. This is an
        empty string when nothing survives (e.g. the input contains only
        stop words or punctuation).
    """
    # Punctuation is deleted, not replaced by a space, so "what's" -> "whats".
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return " ".join(
        _LEMMATIZER.lemmatize(token)
        for token in cleaned.split()
        if token not in _STOP_WORDS
    )

# Store the normalized form of every FAQ question alongside the original.
df["Processed"] = df["Question"].map(preprocess)

In [11]:
# Learn the TF-IDF vocabulary from the preprocessed FAQ questions and keep
# their vectors so incoming queries can be compared against them.
faq_corpus = df["Processed"]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(faq_corpus)

def answer_query(query, min_score=0.0,
                 fallback="Sorry, I don't have an answer for that. Please try rephrasing your question."):
    """Return the answer of the FAQ entry most similar to ``query``.

    The query is normalized with ``preprocess``, projected into the fitted
    TF-IDF space, and compared with every stored FAQ question by cosine
    similarity.

    Args:
        query: The user's question as a raw string.
        min_score: The best similarity must be strictly greater than this
            value for an answer to be returned. With the default of 0.0 only
            queries that share no vocabulary with any FAQ are rejected.
        fallback: Text returned when no FAQ clears ``min_score``.

    Returns:
        The "Answer" value of the best-matching FAQ row, or ``fallback``
        when nothing matches well enough.
    """
    query_processed = preprocess(query)
    query_vec = vectorizer.transform([query_processed])
    # One query row in, so the result has a single row of per-FAQ scores.
    scores = cosine_similarity(query_vec, tfidf_matrix)[0]
    best_idx = int(np.argmax(scores))
    # Without this guard an all-zero score row makes argmax pick index 0,
    # so unrelated input would always get the first FAQ's answer.
    if scores[best_idx] <= min_score:
        return fallback
    return df.iloc[best_idx]["Answer"]

In [12]:
# Demo: ask the bot a question phrased differently from the stored FAQ.
user_input = "What is the admission fee?"
bot_reply = answer_query(user_input)
print("Bot:", bot_reply)

Bot: Admission fee is 5000.
