<a href="https://colab.research.google.com/github/tesims/Recommendation-Systems-Similarity-Scoring/blob/main/Recommendation_Systems_Similarity_Scoring_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


<h2>FAQ Question Matching via Similarity Scoring</h2>



In [None]:
import csv
import math
import pandas as pd
from google.colab import files


# Stop-word list handed to count_vectorizer_from_csv as its `stop_words` argument.
remove_stop_words = [
    'a', 'an', 'and', 'as', 'at', 'for', 'from',
    'is', 'it', 'of', 'on', 'the', 'to', 'with',
]

# Question-signalling words handed to count_vectorizer_from_csv as its
# `question_words` argument. Grouped by kind for readability; a few entries
# appear twice, which is harmless because the list is only used for
# membership checks.
question_words = [
    # interrogatives and conditionals
    "who", "what", "when", "where", "why", "how", "which", "whom", "whose", "whether", "if",
    # forms of "to be"
    "is", "are", "was", "were", "am",
    # auxiliaries
    "can", "could", "will", "would", "should",
    "do", "does", "did", "has", "have", "had",
    "can", "could", "shall", "should", "will", "would", "may", "might", "must",
    # contractions
    "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't",
    "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't",
    "shouldn't", "can't", "cannot", "couldn't", "let's",
]

def upload_file():
    """Open the Colab upload prompt and return the result of `files.upload()`.

    Callers in this notebook read the uploaded file name from the returned
    object's `.keys()`.
    """
    return files.upload()

def csv_to_df(uploaded):
    """Load the first uploaded CSV file into a pandas DataFrame.

    Args:
        uploaded: Mapping whose keys are file names (paths readable by
            `pd.read_csv`). Only the first key is used.

    Returns:
        The DataFrame parsed from that file.

    Raises:
        ValueError: If nothing was uploaded, or if the CSV has fewer than
            two columns.
    """
    # An empty mapping used to surface as an opaque IndexError; report the
    # actual problem instead.
    if not uploaded:
        raise ValueError("No file was uploaded.")

    file_name = next(iter(uploaded.keys()))

    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_name)

    if len(df.columns) < 2:
        raise ValueError("The CSV file must have at least two columns.")

    return df

def get_matching_value(input_string, dataframe):
    """Look up `input_string` in the first column and return the paired value.

    Args:
        input_string: Value to search for in the first column, or None.
        dataframe: DataFrame with at least two columns.

    Returns:
        "I don't know." when `input_string` is None; otherwise the
        second-column value of the first row whose first column equals
        `input_string`, or None when no row matches.
    """
    if input_string is None:
        return "I don't know."

    first_column = dataframe.iloc[:, 0]
    hits = dataframe[first_column == input_string]

    if hits.empty:
        return None

    # Only the first matching row is used.
    return hits.iloc[0, 1]


def count_vectorizer_from_csv(uploaded, stop_words=None, question_words=None):
    """Build a weighted bag-of-words matrix from the first column of a CSV.

    The first row of the file is skipped as a header. Each remaining
    non-empty row contributes its first cell as one document, tokenised by
    whitespace.

    Args:
        uploaded: Mapping whose keys are file names; only the first is read.
        stop_words: Optional iterable of words excluded from the vocabulary
            (compared case-insensitively). None or empty disables filtering.
        question_words: Optional iterable of words whose counts are doubled
            (compared case-insensitively). None or empty disables weighting.

    Returns:
        A tuple `(count_matrix, vocabulary, corpus)`: one count vector per
        document, the sorted list of vocabulary words, and the list of
        document strings.

    Raises:
        ValueError: If `uploaded` is empty.
    """
    if not uploaded:
        raise ValueError("No file was uploaded.")

    file_name = list(uploaded.keys())[0]

    # newline='' is what the csv module expects so quoted line breaks parse
    # correctly; utf-8 matches the default used by pandas.read_csv.
    with open(file_name, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header; tolerate a completely empty file
        corpus = [row[0] for row in reader if row]  # blank lines yield [] rows

    # Normalise the optional word lists once so that None is safe and
    # membership tests are O(1).
    stop_set = {word.lower() for word in stop_words} if stop_words else set()
    boosted = {word.lower() for word in question_words} if question_words else set()

    # Tokenise each document once instead of once per vocabulary word.
    tokenized = [doc.split() for doc in corpus]

    # The original condition was inverted, so stop words were never removed.
    # Sorting makes the column order reproducible between runs.
    vocabulary = sorted({
        word
        for tokens in tokenized
        for word in tokens
        if word.lower() not in stop_set
    })

    # Word counts per document, with question words given double weight.
    count_matrix = [
        [tokens.count(word) * (2 if word.lower() in boosted else 1) for word in vocabulary]
        for tokens in tokenized
    ]

    return count_matrix, vocabulary, corpus

def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two numeric vectors.

    Components are paired positionally for the dot product, so trailing
    components of a longer vector do not contribute to it. Returns 0 when
    either vector has zero magnitude.
    """
    dot_product = 0
    for left, right in zip(vector1, vector2):
        dot_product += left * right

    norm1 = math.sqrt(sum(component ** 2 for component in vector1))
    norm2 = math.sqrt(sum(component ** 2 for component in vector2))

    # A zero-length vector has no direction; treat it as dissimilar.
    if not (norm1 and norm2):
        return 0
    return dot_product / (norm1 * norm2)

def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets: |intersection| / |union|.

    Element order is irrelevant, which makes this suitable for comparing
    sets of tokens. Returns 0 when both sets are empty.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)

    if not combined:
        return 0
    return len(shared) / len(combined)

def tfidf_similarity(query_vector, doc_vector, idf_values):
    """Return the cosine similarity of two count vectors after IDF weighting.

    Each component of both vectors is multiplied by the matching entry of
    `idf_values`, and the cosine of the two weighted vectors is returned.

    Args:
        query_vector: Term counts for the query, one per vocabulary word.
        doc_vector: Term counts for the document, same ordering.
        idf_values: Inverse-document-frequency weight per vocabulary word.

    Returns:
        The cosine similarity of the weighted vectors, or 0 when either
        weighted vector has zero magnitude.
    """
    # Weight BOTH vectors. The previous version applied idf once in the dot
    # product but squared in the magnitudes, so the result was not a cosine
    # and could exceed 1.
    weighted_query = [tf * idf for tf, idf in zip(query_vector, idf_values)]
    weighted_doc = [tf * idf for tf, idf in zip(doc_vector, idf_values)]

    dot_product = sum(q * d for q, d in zip(weighted_query, weighted_doc))
    magnitude1 = math.sqrt(sum(q * q for q in weighted_query))
    magnitude2 = math.sqrt(sum(d * d for d in weighted_doc))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0
    return dot_product / (magnitude1 * magnitude2)

def calculate_similarity_scores(query, count_matrix, vocabulary, corpus):
    """Score a query against every document with three similarity measures.

    Args:
        query: Query text; tokenised by whitespace.
        count_matrix: One count vector per document, aligned with `corpus`.
        vocabulary: Words defining the vector components.
        corpus: Document strings.

    Returns:
        A tuple of three dicts `(cosine, jaccard, tfidf)`, each mapping a
        document string to its score. Documents with identical text share
        one key, so the last one's score is kept.
    """
    # Vectorize the query sentence
    query_tokens = query.split()
    query_vector = [query_tokens.count(word) for word in vocabulary]

    # Cosine similarity on raw count vectors
    cosine_scores = [cosine_similarity(query_vector, doc_vector) for doc_vector in count_matrix]

    # Tokenise each document once; reused for Jaccard and document frequency
    doc_token_sets = [set(doc.split()) for doc in corpus]

    # Jaccard similarity on token sets
    query_set = set(query_tokens)
    jaccard_scores = [jaccard_similarity(query_set, tokens) for tokens in doc_token_sets]

    # IDF per vocabulary word. A word found in no document gets weight 0
    # instead of triggering a division by zero.
    num_docs = len(corpus)
    idf_values = []
    for word in vocabulary:
        document_frequency = sum(1 for tokens in doc_token_sets if word in tokens)
        if document_frequency:
            idf_values.append(math.log(num_docs / document_frequency))
        else:
            idf_values.append(0.0)
    tfidf_scores = [tfidf_similarity(query_vector, doc_vector, idf_values) for doc_vector in count_matrix]

    # Map each sentence to its score under each measure
    cosine_similarity_dict = dict(zip(corpus, cosine_scores))
    jaccard_similarity_dict = dict(zip(corpus, jaccard_scores))
    tfidf_similarity_dict = dict(zip(corpus, tfidf_scores))

    return cosine_similarity_dict, jaccard_similarity_dict, tfidf_similarity_dict

def _best_match_index(scores):
    """Return the index of the first highest score, or -1 if none is positive."""
    if not scores:
        return -1
    best = max(scores)
    return scores.index(best) if best > 0 else -1


def find_most_similar_question(user_question, count_matrix, vocabulary, corpus):
    """Find the best-matching document index under three similarity measures.

    Args:
        user_question: Question text; tokenised by whitespace.
        count_matrix: One count vector per document, aligned with `corpus`.
        vocabulary: Words defining the vector components.
        corpus: Document strings.

    Returns:
        A tuple `(cosine_index, jaccard_index, tfidf_index)`. Each entry is
        the index of the first document with the highest score for that
        measure, or -1 when the corpus is empty or no document scores above
        zero. Returning -1 stops an unrelated question from matching
        document 0 on all three measures merely because every score is 0.
    """
    # Vectorize the user question
    user_tokens = user_question.split()
    user_vector = [user_tokens.count(word) for word in vocabulary]

    # Cosine similarity on raw count vectors
    cosine_similarity_scores = [cosine_similarity(user_vector, doc_vector) for doc_vector in count_matrix]

    # Tokenise each document once; reused for Jaccard and document frequency
    doc_token_sets = [set(doc.split()) for doc in corpus]

    # Jaccard similarity on token sets
    user_set = set(user_tokens)
    jaccard_similarity_scores = [jaccard_similarity(user_set, tokens) for tokens in doc_token_sets]

    # IDF per vocabulary word. A word found in no document gets weight 0
    # instead of triggering a division by zero.
    num_docs = len(corpus)
    idf_values = []
    for word in vocabulary:
        document_frequency = sum(1 for tokens in doc_token_sets if word in tokens)
        if document_frequency:
            idf_values.append(math.log(num_docs / document_frequency))
        else:
            idf_values.append(0.0)
    tfidf_similarity_scores = [tfidf_similarity(user_vector, doc_vector, idf_values) for doc_vector in count_matrix]

    # Best index per measure (-1 when there is no positive score)
    return (
        _best_match_index(cosine_similarity_scores),
        _best_match_index(jaccard_similarity_scores),
        _best_match_index(tfidf_similarity_scores),
    )

def compare_similarities(cosine_index, jaccard_index, tfidf_index, corpus):
    """Pick the question that at least two similarity measures agree on.

    Agreement is checked in priority order: all three measures, cosine with
    Jaccard, cosine with TF-IDF, then Jaccard with TF-IDF. The agreed index
    must lie inside `corpus`. The outcome is printed as well as returned.

    Returns:
        The agreed question string from `corpus`, or None when no two
        measures agree on a valid index.
    """
    total = len(corpus)

    def _announce(header, index):
        # Print the header and the chosen question, then hand it back.
        chosen = corpus[index]
        print(header)
        print(chosen)
        return chosen

    if 0 <= cosine_index < total and cosine_index == jaccard_index == tfidf_index:
        return _announce("Most Similar Question (All Methods):", cosine_index)

    if 0 <= cosine_index < total and cosine_index == jaccard_index:
        return _announce("Most Similar Question (Cosine and Jaccard):", cosine_index)

    if 0 <= cosine_index < total and cosine_index == tfidf_index:
        return _announce("Most Similar Question (Cosine and TF-IDF):", cosine_index)

    if 0 <= jaccard_index < total and jaccard_index == tfidf_index:
        return _announce("Most Similar Question (Jaccard and TF-IDF):", jaccard_index)

    print("We were unable to find the answer to your question.")
    return None

# Upload the FAQ CSV, then build the answer lookup table and the count vectors from it
uploaded = upload_file()

dataframe = csv_to_df(uploaded)
count_vector_matrix, vocabulary, corpus = count_vectorizer_from_csv(uploaded, remove_stop_words, question_words)

# Main loop for continuous question asking
while True:
    # Get user input for the question
    user_question = input("Enter a question (or type 'exit' to quit): ")

    # Check if the user wants to exit (surrounding whitespace is ignored)
    if user_question.strip().lower() == 'exit':
        print("Exiting...")
        break

    # Find the most similar questions using all similarity measures
    cosine_index, jaccard_index, tfidf_index = find_most_similar_question(user_question, count_vector_matrix, vocabulary, corpus)

    # Get the most similar question and its corresponding answer
    matched_question = compare_similarities(cosine_index, jaccard_index, tfidf_index, corpus)
    most_similar_answer = get_matching_value(matched_question, dataframe)

    # get_matching_value returns None when the matched question is missing
    # from the DataFrame, and pandas may parse answers as numbers; either
    # case made the old string concatenation raise TypeError.
    if most_similar_answer is None:
        most_similar_answer = "I don't know."

    print("Answer: " + str(most_similar_answer))




Saving Copy of eecs298_faq_bot - Modified Training.csv to Copy of eecs298_faq_bot - Modified Training (27).csv
Most Similar Question (All Methods):
Approximately, how many individuals are in the EECS 298 class?
Answer: 15


In [None]:
def evaluate_faq_bot_uploaded(test_file, count_matrix, vocabulary, corpus, dataframe):
    """Evaluate the FAQ bot's accuracy, precision and recall on labelled questions.

    Each test question is classified as:
      * true positive  - the bot answered and the answer matches,
      * false positive - the bot answered but the answer differs,
      * false negative - the bot produced no answer.

    Answers are compared case-insensitively after conversion to `str`.

    Args:
        test_file: Iterable of rows where `row[0]` is the question and
            `row[1]` is the expected answer.
        count_matrix: Count vectors for the FAQ corpus.
        vocabulary: Vocabulary matching `count_matrix`.
        corpus: FAQ question strings.
        dataframe: DataFrame mapping FAQ questions to answers.

    Returns:
        A tuple `(accuracy, precision, recall)`, where accuracy is
        TP / total, precision is TP / (TP + FP) and recall is
        TP / (TP + FN). A ratio with a zero denominator is reported as 0,
        and an empty test set yields `(0, 0, 0)`.
    """
    test_data = [(row[0], row[1]) for row in test_file]

    total_questions = len(test_data)
    if total_questions == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0, 0, 0

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for user_question, correct_answer in test_data:
        cosine_index, jaccard_index, tfidf_index = find_most_similar_question(user_question, count_matrix, vocabulary, corpus)
        matched_question = compare_similarities(cosine_index, jaccard_index, tfidf_index, corpus)

        bot_answer = get_matching_value(matched_question, dataframe) if matched_question else None

        if bot_answer is None:
            # No agreed question, or the question has no row in the DataFrame.
            false_negatives += 1
        elif str(bot_answer).lower() == str(correct_answer).lower():
            true_positives += 1
        else:
            false_positives += 1

    answered = true_positives + false_positives
    answerable = true_positives + false_negatives

    accuracy = true_positives / total_questions
    precision = true_positives / answered if answered > 0 else 0
    # The previous formula divided true positives by the number of correct
    # answers, which are the same count, so recall was always 1.
    recall = true_positives / answerable if answerable > 0 else 0

    return accuracy, precision, recall