<a href="https://colab.research.google.com/github/tesims/myspace-themed-personal-website/blob/main/Recommendation_Systems_Similarity_Scoring_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


<h2>Cosine Similarity Scoring</h2>



In [None]:
import csv
import math
from google.colab import files

def count_vectorizer_from_csv():
    """Upload a CSV in Google Colab and build a bag-of-words count matrix.

    Calls ``files.upload()``, opens the file named by the first key of the
    returned mapping, discards the first row as a header, and treats the
    first column of every remaining non-empty row as one document.
    Documents are tokenized with ``str.split()``, so tokens are
    case-sensitive and keep any attached punctuation.

    Returns:
        A ``(count_matrix, vocabulary, corpus)`` tuple. ``corpus`` is the
        list of documents, ``vocabulary`` is the sorted list of unique
        tokens, and ``count_matrix[i][j]`` is the number of times
        ``vocabulary[j]`` occurs in ``corpus[i]``.
    """
    from collections import Counter

    # Upload the CSV file in Google Colab
    uploaded = files.upload()
    file_name = list(uploaded)[0]

    # newline="" is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(file_name, "r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; tolerate an empty file
        # csv.reader yields [] for blank lines, which have no first column.
        corpus = [row[0] for row in reader if row]

    # Tokenize each document once and reuse the tokens below.
    tokenized_docs = [doc.split() for doc in corpus]

    # Sorting makes the column order reproducible from run to run.
    vocabulary = sorted({word for tokens in tokenized_docs for word in tokens})

    # One row per document; Counter returns 0 for words a document lacks.
    count_matrix = []
    for tokens in tokenized_docs:
        token_counts = Counter(tokens)
        count_matrix.append([token_counts[word] for word in vocabulary])

    return count_matrix, vocabulary, corpus

def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two numeric vectors.

    Components are paired positionally for the dot product. If either
    vector has zero magnitude the result is 0 instead of a division by zero.
    """
    numerator = sum(x * y for x, y in zip(vector1, vector2))
    norm1 = math.sqrt(sum(x ** 2 for x in vector1))
    norm2 = math.sqrt(sum(y ** 2 for y in vector2))

    # Only divide when both magnitudes are non-zero.
    if norm1 != 0 and norm2 != 0:
        return numerator / (norm1 * norm2)
    return 0

def calculate_similarity_scores(query, count_matrix, vocabulary, corpus):
    """Score every corpus document against ``query`` by cosine similarity.

    Args:
        query: Text to compare; tokenized with ``str.split()``. Tokens that
            are not in ``vocabulary`` do not contribute to the vector.
        count_matrix: One count vector per document, with columns aligned
            to ``vocabulary``.
        vocabulary: Tokens that define the vector dimensions.
        corpus: Documents, expected to be in the same order as the rows of
            ``count_matrix``.

    Returns:
        A dict mapping each document to its cosine similarity with the
        query. Identical documents share one key; the later row's score wins.
    """
    from collections import Counter

    # Tokenize and count the query once instead of once per vocabulary word.
    query_counts = Counter(query.split())
    query_vector = [query_counts[word] for word in vocabulary]

    # Pair each document with its count vector positionally.
    return {
        doc: cosine_similarity(query_vector, doc_vector)
        for doc, doc_vector in zip(corpus, count_matrix)
    }

def find_most_similar_question(user_question, count_matrix, vocabulary, corpus):
    """Return the corpus entry with the highest cosine similarity to the question.

    Args:
        user_question: Text to match; tokenized with ``str.split()``.
        count_matrix: One count vector per document, with columns aligned
            to ``vocabulary``.
        vocabulary: Tokens that define the vector dimensions.
        corpus: Documents, expected to be in the same order as the rows of
            ``count_matrix``.

    Returns:
        The document whose row scores highest. Ties go to the earliest row,
        so when no row shares a token with the question (all scores are 0)
        the first document is returned.

    Raises:
        ValueError: If ``count_matrix`` is empty, so there is nothing to rank.
    """
    from collections import Counter

    # Tokenize and count the question once instead of once per vocabulary word.
    question_counts = Counter(user_question.split())
    user_vector = [question_counts[word] for word in vocabulary]

    similarity_scores = [
        cosine_similarity(user_vector, doc_vector) for doc_vector in count_matrix
    ]
    if not similarity_scores:
        raise ValueError("Cannot find a similar question: the corpus is empty.")

    # max() with a key returns the first maximal index, matching
    # list.index(max(...)) tie-breaking.
    most_similar_index = max(
        range(len(similarity_scores)), key=similarity_scores.__getitem__
    )
    return corpus[most_similar_index]

# Build the bag-of-words model from the CSV the user uploads
count_vector_matrix, vocabulary, corpus = count_vectorizer_from_csv()

# Prompt for the question to look up
user_question = input("Enter a question: ")

# Pick the corpus entry closest to the user's question
most_similar_question = find_most_similar_question(
    user_question,
    count_vector_matrix,
    vocabulary,
    corpus,
)

# Show the heading (preceded by a blank line) and the match on separate lines
print("\nMost Similar Question:", most_similar_question, sep="\n")


Saving eecs298_faq_bot - Training.csv to eecs298_faq_bot - Training (3).csv
Enter a question: Where is EECS 298?

Most Similar Question:
What is EECS 298?


<h2>Jaccard and Cosine Similarity Scoring</h2>

In [None]:
import csv
import math
from google.colab import files

def count_vectorizer_from_csv():
    """Upload a CSV in Google Colab and build a bag-of-words count matrix.

    Calls ``files.upload()``, opens the file named by the first key of the
    returned mapping, discards the first row as a header, and treats the
    first column of every remaining non-empty row as one document.
    Documents are tokenized with ``str.split()``, so tokens are
    case-sensitive and keep any attached punctuation.

    Returns:
        A ``(count_matrix, vocabulary, corpus)`` tuple. ``corpus`` is the
        list of documents, ``vocabulary`` is the sorted list of unique
        tokens, and ``count_matrix[i][j]`` is the number of times
        ``vocabulary[j]`` occurs in ``corpus[i]``.
    """
    from collections import Counter

    # Upload the CSV file in Google Colab
    uploaded = files.upload()
    file_name = list(uploaded)[0]

    # newline="" is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(file_name, "r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; tolerate an empty file
        # csv.reader yields [] for blank lines, which have no first column.
        corpus = [row[0] for row in reader if row]

    # Tokenize each document once and reuse the tokens below.
    tokenized_docs = [doc.split() for doc in corpus]

    # Sorting makes the column order reproducible from run to run.
    vocabulary = sorted({word for tokens in tokenized_docs for word in tokens})

    # One row per document; Counter returns 0 for words a document lacks.
    count_matrix = []
    for tokens in tokenized_docs:
        token_counts = Counter(tokens)
        count_matrix.append([token_counts[word] for word in vocabulary])

    return count_matrix, vocabulary, corpus

def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between the two vectors being compared.

    The dot product pairs components positionally. A zero-magnitude vector
    on either side yields 0 rather than a division by zero.
    """
    dot = sum(left * right for left, right in zip(vector1, vector2))
    length1 = math.sqrt(sum(component ** 2 for component in vector1))
    length2 = math.sqrt(sum(component ** 2 for component in vector2))

    # Divide only when neither vector is degenerate.
    if length1 != 0 and length2 != 0:
        return dot / (length1 * length2)
    return 0

def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two sets.

    This is the size of the intersection divided by the size of the union.
    Element order plays no role, which makes the measure a natural fit for
    comparing sets. When the union is empty the result is 0.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)

    # Guard against dividing by zero when both inputs are empty.
    if not combined:
        return 0
    return len(shared) / len(combined)

def calculate_similarity_scores(query, count_matrix, vocabulary, corpus):
    """Score every corpus document against ``query`` by cosine and Jaccard similarity.

    Args:
        query: Text to compare; tokenized with ``str.split()``.
        count_matrix: One count vector per document, with columns aligned
            to ``vocabulary``; used for the cosine scores.
        vocabulary: Tokens that define the vector dimensions.
        corpus: Documents, expected to be in the same order as the rows of
            ``count_matrix``; their token sets are used for the Jaccard scores.

    Returns:
        A ``(cosine_similarity_dict, jaccard_similarity_dict)`` tuple, each
        mapping a document to its score. Identical documents share one key;
        the later occurrence's score wins.
    """
    from collections import Counter

    # Tokenize the query once and reuse the tokens for both measures.
    query_tokens = query.split()
    query_counts = Counter(query_tokens)
    query_vector = [query_counts[word] for word in vocabulary]
    query_set = set(query_tokens)

    # Cosine scores pair each document with its count vector positionally.
    cosine_similarity_dict = {
        doc: cosine_similarity(query_vector, doc_vector)
        for doc, doc_vector in zip(corpus, count_matrix)
    }

    # Jaccard scores compare token sets, so word frequency is ignored.
    jaccard_similarity_dict = {
        doc: jaccard_similarity(query_set, set(doc.split())) for doc in corpus
    }

    return cosine_similarity_dict, jaccard_similarity_dict

def find_most_similar_question(user_question, count_matrix, vocabulary, corpus):
    """Return the best-matching corpus entries under cosine and Jaccard similarity.

    Args:
        user_question: Text to match; tokenized with ``str.split()``.
        count_matrix: One count vector per document, with columns aligned
            to ``vocabulary``; used for the cosine scores.
        vocabulary: Tokens that define the vector dimensions.
        corpus: Documents, expected to be in the same order as the rows of
            ``count_matrix``; their token sets are used for the Jaccard scores.

    Returns:
        A ``(cosine_match, jaccard_match)`` tuple of documents. Under each
        measure ties go to the earliest entry, so when nothing overlaps
        with the question (all scores are 0) the first document is returned.

    Raises:
        ValueError: If ``count_matrix`` or ``corpus`` is empty, so there is
            nothing to rank.
    """
    from collections import Counter

    # Tokenize the question once and reuse the tokens for both measures.
    question_tokens = user_question.split()
    question_counts = Counter(question_tokens)
    user_vector = [question_counts[word] for word in vocabulary]
    user_set = set(question_tokens)

    cosine_similarity_scores = [
        cosine_similarity(user_vector, doc_vector) for doc_vector in count_matrix
    ]
    jaccard_similarity_scores = [
        jaccard_similarity(user_set, set(doc.split())) for doc in corpus
    ]
    if not cosine_similarity_scores or not jaccard_similarity_scores:
        raise ValueError("Cannot find a similar question: the corpus is empty.")

    # max() with a key returns the first maximal index, matching
    # list.index(max(...)) tie-breaking.
    most_similar_cosine_index = max(
        range(len(cosine_similarity_scores)),
        key=cosine_similarity_scores.__getitem__,
    )
    most_similar_jaccard_index = max(
        range(len(jaccard_similarity_scores)),
        key=jaccard_similarity_scores.__getitem__,
    )

    return corpus[most_similar_cosine_index], corpus[most_similar_jaccard_index]

# Build the bag-of-words model from the CSV the user uploads
count_vector_matrix, vocabulary, corpus = count_vectorizer_from_csv()

# Prompt for the question to look up
user_question = input("Enter a question: ")

# Pick the closest corpus entry under each similarity measure
most_similar_cosine, most_similar_jaccard = find_most_similar_question(
    user_question,
    count_vector_matrix,
    vocabulary,
    corpus,
)

# Show each heading (preceded by a blank line) followed by its match
print("\nMost Similar Question (Cosine Similarity):", most_similar_cosine, sep="\n")
print("\nMost Similar Question (Jaccard Similarity):", most_similar_jaccard, sep="\n")


Saving eecs298_faq_bot - Training.csv to eecs298_faq_bot - Training (4).csv
Enter a question: Where is eecs 298?

Most Similar Question (Cosine Similarity):
What is EECS 298?

Most Similar Question (Jaccard Similarity):
What is EECS 298?
