# Tutorial 1 - SentenceBert
Import all necessary packages

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import warnings
import numpy as np
warnings.filterwarnings('ignore')

## Load the Sentence Transformer model

In [2]:
model = SentenceTransformer("all-MiniLM-L6-v2")

# Our sentences to encode
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog."
]

# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding[:5])
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-0.01371735 -0.04285156 -0.01562862  0.0140538   0.03955379]

Sentence: Sentences are passed as a list of string.
Embedding: [ 0.05645249  0.0550024   0.0313796   0.03394846 -0.03542465]

Sentence: The quick brown fox jumps over the lazy dog.
Embedding: [0.04393354 0.05893435 0.04817838 0.07754801 0.02674439]



## The Amazon Reviews dataset 
The Amazon Reviews dataset consists of three columns:  
<ul>
<li> Polarity: This is the sentiment of the review. It is a binary variable with '0' indicating a negative sentiment and '1' indicating a positive sentiment.  </li>
<li> Title: This is the title of the review. It is a string that summarizes the reviewer's experience in a few words.  </li>
<li> Text: This is the main body of the review. It contains the detailed opinion of the reviewer about the product.</li> 
</ul>

The dataset provides valuable insights into customer opinions and sentiments towards products. It can be used for various tasks such as sentiment analysis, text classification, and customer behavior prediction.<hr></hr>

In [3]:
# Load the dataset
df = pd.read_csv('archive/train.csv', header=None, names=("polarity", "title", "text"))
# Convert labels to '0' and '1'
df.polarity = df.polarity.map({1: 0, 2: 1})
# Display the first 5 rows of the dataset
df.head(5)

Unnamed: 0,polarity,title,text
0,1,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,1,Amazing!,This soundtrack is my favorite music of all ti...
3,1,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,1,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


# We will create some helper functions to find neighbors based on euclidean distance

In [4]:
def create_cosine_similarity_matrix(vectors_1, vectors_2):
    """
    This function computes the pairwise cosine similarity between vectors in a matrix.
    Args:
        vectors_1: A 2D numpy array where each row represents a vector.
        vectors_2: A 2D numpy array where each row represents a vector.
    Returns:
         A 2D numpy array where the element at position (i, j) represents the cosine similarity between vectors[i] and vectors[j].
    """
    
    # Normalize the vectors
    vectors_1_norm = vectors_1 / np.linalg.norm(vectors_1, axis=1, keepdims=True)
    vectors_2_norm = vectors_2 / np.linalg.norm(vectors_2, axis=1, keepdims=True)
    
    # Compute cosine similarity
    cosine_similarity_matrix = np.dot(vectors_1_norm, vectors_2_norm.T)
    
    return cosine_similarity_matrix

In [5]:
def find_nearest_neighbors(embeddings_1, embeddings_2, k=3):
    """
    This function finds the k-nearest neighbors for each vector in a matrix.
    Args:
        embeddings_1: A 2D numpy array where each row represents a vector.
        embeddings_2: A 2D numpy array where each row represents a vector.
        k: The number of nearest neighbors to find.
    Returns:
        A 2D numpy array where the element at position (i, j) represents the index of the j-th nearest neighbor of embeddings_1[i] in embeddings_2.
    """
    # Compute cosine similarity matrix
    cosine_similarity_matrix = create_cosine_similarity_matrix(embeddings_1, embeddings_2)
    
    # Find the indices of the k-nearest neighbors
    nearest_neighbors_indices = np.argsort(-cosine_similarity_matrix, axis=1)[:, 1:k+1]
    
    return nearest_neighbors_indices

Embedd both positive and negative reviews

In [6]:
%%time
# Filter only positive and negative reviews
positive_reviews = df[df.polarity == 1].text.tolist()[:500]
negative_reviews = df[df.polarity == 0].text.tolist()[:500]
all_reviews = positive_reviews + negative_reviews

# Embedd the reviews
positive_embeddings = model.encode(positive_reviews)
negative_embeddings = model.encode(negative_reviews)
all_embeddings = model.encode(all_reviews)

Let us see an example of semantic similarity, we embedd the positive reviews and find the closest reviews to each review. <br>

The closest reviews are the reviews that have the smallest distance to the review in the embedding space. <br>

The distance is computed using the euclidean distance between the embeddings.<br> 

The closest reviews are likely to have similar content or sentiment to the review.

In [14]:
closest_neighbors = find_nearest_neighbors(positive_embeddings, positive_embeddings, k=2)

for i, neighbors in enumerate(closest_neighbors):
    print("Review:\n\t", positive_reviews[i])
    print()
    print("Closest reviews:")
    print()
    for j, neighbor in enumerate(neighbors):
        print("\t", positive_reviews[neighbor])
        print("-----------------------------------------------")
        print()
    print("##########################################################################################")
    break


Review:
	 This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^

Closest reviews:

	 I truly like this soundtrack and I enjoy video game music. I have played this game and most of the music on here I enjoy and it's truly relaxing and peaceful.On disk one. my favorites are Scars Of Time, Between Life and Death, Forest Of Illusion, Fortress of Ancient Dragons, Lost Fragment, and Drowned Valley.Disk Two: The Draggons, Galdorb - Home, Chronomantique, Prisoners of Fate, Gale, and my girlfriend likes ZelbessDisk Three: The best of the three. Garden Of God, Chronopolis, Fates, Jellyfish sea, Burning Orphange, Dragon's Prayer, Tower Of Stars, Dragon God, and 

Now let's see what happens when we try to find reviews that are similar to a new review written be me

In [8]:
new_review = "I love my new desk! very sturdy amd reliable. I would recommend it to anyone."
new_embedding = model.encode([new_review])

In [9]:
new_closest_neighbors = find_nearest_neighbors(new_embedding, all_embeddings)

for i, neighbors in enumerate(new_closest_neighbors):
    print("Review:", new_review)
    print("Closest reviews:")
    for j, neighbor in enumerate(neighbors):
        print("\t", all_reviews[neighbor])
        print("\n-----------------------------------------------\n")
        

Review: I love my new desk! very sturdy amd reliable. I would recommend it to anyone.
Closest reviews:
	 I was very pleased with this desk and chair- Was able to assemble it fairly easy and now my daughter is enjoying it very much- The desk is quite sturdy and the chair is strong- The delivery of my product was expedited quickly- Thank you very much

-----------------------------------------------

	 We bought this desk for my seven year old daughter to keep her lap top on and to do homework. It was not to disappoint. The desk is perfect size for her to sit and work. Very high quality and easy to put together. I would highly recommend this product to others?

-----------------------------------------------

	 This desk is definitely worth the money. My daughter wanted a white desk with a hutch and had picked out many over [...] not including the chair. I ordered this one without her knowing and at first she was upset because the hutch is not that big. But once it was assembled she love

## Sentiment Analysis using Sentence Transformers
We can use the sentence embeddings and their proximity to one another in euclidean space to perform sentiment analysis. We can use the embeddings to train a classifier that can predict the sentiment of a review based on the sentence embeddings. We can use the embeddings as features and the sentiment labels as the target variable. We can then train a classifier such as K-Nearest Neighbors to predict the sentiment of a new review based on its embedding. 

 Query the first 1000 reviews and their labels

In [10]:
embeddings = model.encode(df.text.tolist()[:1000])
labels = df.polarity.to_numpy()[:1000]

Train a KNN calssifier to

In [11]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print("Model accuracy: ", knn.score(X_test, y_test))

Model accuracy:  0.745


## Comparing Sentence Transformers to each other
One thing to notice, is that sentence transformers usually don't work well together. This is because they are usually trained on different datasets and have different architectures/objectives. This means that the embeddings they generate are not directly comparable. 

In [12]:
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm

sentences_to_compare = ["When I watch Hamilton my heart melts", "I love watching Hamilton"]
model_1 = SentenceTransformer('sentence-transformers/LaBSE')
model_2 = SentenceTransformer("sentence-transformers/sentence-t5-base")

embeddings_1 = model_1.encode(sentences_to_compare)
embeddings_2 = model_2.encode(sentences_to_compare)

print("Similarity between the sentences using only model 1:", (embeddings_1[0] @ embeddings_1[1]) / norm(embeddings_1[0]) * norm(embeddings_1[1]))
print("Similarity between the sentences using only model 2:", (embeddings_2[0] @ embeddings_2[1]) / norm(embeddings_2[0]) * norm(embeddings_2[1]))
print("")
print("Similarity between the sentences using model 1 to model 2 :", (embeddings_1[0] @ embeddings_2[0]) / norm(embeddings_1[0]) * norm(embeddings_2[0]))


Similarity between the sentences using only model 1: 0.7395603
Similarity between the sentences using only model 2: 0.9445546

Similarity between the sentences using model 1 to model 2 : 0.03902055
