In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import re
import tqdm
from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import save_npz, load_npz
import functions
from dotenv import load_dotenv

In [6]:
#Load in data
tf_idf_matrix = load_npz("Dataset/sparse_matrix.npz")
df = pd.read_csv("Dataset/cleaned.csv",usecols = ['Id','Score_question','question','Score_answer','Body_answer'])
word_index_df = pd.read_csv('Dataset/word_to_index.csv', keep_default_na=False)
unique_words = dict(zip(word_index_df['word'], word_index_df['index']))
idf = pd.read_csv('Dataset/idf.csv', keep_default_na=False)
idf = dict(zip(idf['word'], idf['idf_score']))
df = df.dropna()
unique_df = df[['question',"Id"]].drop_duplicates()

In [7]:
stop_words = set(stopwords.words('english')) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    # Expand contractions
    text = functions.expand_contractions(text)
    
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercase and keep only alphabetic words
    tokens = [word for word in tokens if word.isalpha()]

    # Remove stopwords
    tokens = [w for w in tokens if w not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

Returns the most similar question, and the highest rated answer for that question.

In [8]:
def chatbot_reply(user_query):
    # Transform user query into TF-IDF vector using the same steps we did on the dataframe
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))

    # Compute cosine similarity across whole df
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    
    # Find the most similar question
    idx = similarity.argmax()
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'] == Id].sort_values('Score_answer',ascending=False).iloc[0]
    # Retrieve the best matching Q&A
    return best_ans["Body_answer"], best_ans["question"], similarity[idx]

Returns the n most similar questions, and their corresponding answers with a score>=score_req. Returns a dataframe, with all the original columns and similarity.

In [9]:
def top_n_results(user_query,n=1,score_req = 10):
    # Transform user query into TF-IDF vector
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data,cols = [],[]
    for word,count in tf.items():
        if word in unique_words:
            data.append((count/length)*idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix((data, ([0]*len(cols), cols)), shape=(1, len(unique_words)))

    # Compute cosine similarity across whole df
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    # Find the most similar question(s)
    idx = np.argsort(-similarity)[:n]
    values = -np.sort(-similarity)[:n]
    idx_values = pd.DataFrame({'Id':unique_df.iloc[idx]['Id'], 'Similarity':values})
    Id = unique_df.iloc[idx]['Id']
    best_ans = df[df['Id'].isin(Id)]
    best_ans = best_ans[best_ans['Score_answer']>=score_req]
    return best_ans.merge(idx_values, on='Id')

In [10]:
temp = top_n_results("How to print a backslash in Python?",n=5,score_req=10)['Body_answer']

Returns the n most similar questions with a score>=lowest_question_score, and their corresponding answers with a score>=score_req, an upper_bound>=score_ratio>=lower_bound. Returns a dataframe, with all the original columns, as well as similarity and Score_ratio.

In [11]:
def top_n_interval_filtered(user_query,n=1,score_req=10,lower_bound=0,upper_bound=1,lowest_question_score=0):
    # Transform user query into TF-IDF vector
    user_query = preprocess_text(user_query).split()
    tf = Counter(user_query)
    length = len(user_query)
    data, cols = [], []
    for word, count in tf.items():
        if word in unique_words:
            data.append((count / length) * idf[word])
            cols.append(unique_words[word])
    query_vec = csr_matrix(
        (data, ([0] * len(cols), cols)),
        shape=(1, len(unique_words))
    )
     # Compute cosine similarity across whole df
    similarity = cosine_similarity(query_vec, tf_idf_matrix).flatten()
    # Find the most similar question(s)
    idx = np.argsort(-similarity)[:n]
    values = -np.sort(-similarity)[:n]
    idx_values = pd.DataFrame({
        'Id': unique_df.iloc[idx]['Id'],
        'Similarity': values
    })
    top_answers = df[df['Id'].isin(idx_values['Id'])]
    top_answers = top_answers[top_answers['Score_answer'] >= score_req] #Filter on score_req for answer
    merged = top_answers.merge(idx_values, on='Id')
    merged['Score_ratio'] = abs(merged["Score_question"]).div(merged["Score_answer"].dropna())
    # Filter on score ratio and question score
    filtered = merged[
        (merged["Score_ratio"] >= lower_bound) &
        (merged["Score_ratio"] <= upper_bound) &
        (merged["Score_question"] > lowest_question_score)
    ]
    return filtered.sort_values(by="Similarity", ascending=False)

Creates a prompt for an LLM to answer user question

In [12]:
def create_summary_string(user_query):
    results = top_n_interval_filtered(user_query,n=5,score_req=10,lower_bound=0,upper_bound=1.5,lowest_question_score=0)
    str = ''
    count = 0
    for ans in temp:
        count+=1
        str += f'Answer {count}:\n{ans}\n\n'
    #work on prompt
    instructions = 'Answer this question based on the following answers from Stack Overflow. If none of the answers are relevant, or there are no answers respond with "I do not know".\n\n'
    str = instructions + f'Question:\n{user_query}\n\n' + str
    return str

In [13]:
create_summary_string("how to print escape character")

'Answer this question based on the following answers from Stack Overflow. If none of the answers are relevant, or there are no answers respond with "I do not know".\n\nQuestion:\nhow to print escape character\n\nAnswer 1:\nNo need to use str.replace or string.replace here, just convert that string to a raw string:\n\n>>> strs = r"C:\\Users\\Josh\\Desktop\\20130216"\n           ^\n           |\n       notice the \'r\'\n\n\nBelow is the repr version of the above string, that\'s why you\'re seeing \\\\ here.\nBut, in fact the actual string contains just \'\\\' not \\\\.\n\n>>> strs\n\'C:\\\\Users\\\\Josh\\\\Desktop\\\\20130216\'\n\n>>> s = r"f\\o"\n>>> s            #repr representation\n\'f\\\\o\'\n>>> len(s)   #length is 3, as there\'s only one `\'\\\'`\n3\n\n\nBut when you\'re going to print this string you\'ll not get \'\\\\\' in the output.\n\n>>> print strs\nC:\\Users\\Josh\\Desktop\\20130216\n\n\nIf you want the string to show \'\\\\\' during print then use str.replace:\n\n>>> new_s

In [14]:
load_dotenv()
gemini_api_key = os.getenv("api_key")
from google import genai

client = genai.Client(api_key=gemini_api_key)

AI learns patterns from data to make decisions or predictions.


Answers user question based on retrived answers through gemeni 2.5.

In [15]:
def get_gemini_response(user_query):
    prompt = create_summary_string(user_query)
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
    )
    return response.text

In [16]:
get_gemini_response("how to print escape character")

'To print an escape character, specifically a backslash (`\\`), you need to escape it by preceding it with another backslash.\n\n*   **To print a single backslash:**\n    ```python\n    print "\\\\"\n    ```\n    The `\\` character is an escape character itself, so `\\\\` tells the interpreter to treat the second `\\` as a literal backslash rather than the start of an escape sequence.\n\n*   **To print a string containing multiple backslashes (e.g., file paths):**\n    You have a couple of options:\n    1.  **Escape each backslash:** Use `\\\\` for every literal backslash you want in the string.\n        ```python\n        path = "C:\\\\Users\\\\Josh\\\\Desktop"\n        print path\n        # Output: C:\\Users\\Josh\\Desktop\n        ```\n    2.  **Use a raw string:** Declare the string as raw by placing an `r` before the opening quote. In a raw string, backslashes are treated literally and are not interpreted as escape characters. This is often cleaner for paths or regular expressions