# Visualizing the embeddings in 2D with t-SNE
We will use t-SNE to reduce the dimensionality of the embeddings from 1536 to 2. Once the embeddings are reduced to two dimensions, we can plot them in a 2D scatter plot. 

In [None]:
import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
from ast import literal_eval

# Read the review table from disk.
# (Alternative dataset: data/fine_food_reviews_with_embeddings_1k.csv)
datafile_path = "data/amazon_review_with_embeddings_1k.csv"
df = pd.read_csv(datafile_path)

# Each `embedding` cell is parsed with literal_eval into a list of floats;
# the parsed rows are then stacked into a single 2-D array (one row per review).
matrix = np.array([literal_eval(raw) for raw in df.embedding])

# Project the embeddings onto two components with t-SNE.
# The fixed random_state keeps the layout repeatable between runs.
tsne = TSNE(random_state=42, n_components=2)
vis_dims = tsne.fit_transform(matrix)

# Display the shape of the projected array as the cell output
vis_dims.shape

# Plotting the embeddings

We colour each review by its star rating, ranging from red (1 star) through orange, yellow and green to blue (5 stars).

We can observe a decent separation of the data even after reducing it to two dimensions.

In [None]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np

# Palette indexed by (star rating - 1): position 0 colours 1-star reviews,
# position 4 colours 5-star reviews.
colors = ["red", "orange", "yellow", "green", "blue"]
colormap = matplotlib.colors.ListedColormap(colors)

# Split the t-SNE output into its two coordinate columns
x = vis_dims[:, 0]
y = vis_dims[:, 1]

# Zero-based palette index for every review.
# NOTE(review): assumes df.Score holds integer ratings in 1..5 — confirm against the dataset.
color_indices = df.Score.values - 1

fig, ax = plt.subplots(figsize=(8, 8))

# Pin the colour scale to the full palette. Without vmin/vmax matplotlib rescales
# to the min/max of the ratings actually present, so a sample that lacks 1-star or
# 5-star reviews would silently shift every rating to a different colour.
ax.scatter(
    x,
    y,
    c=color_indices,
    cmap=colormap,
    vmin=0,
    vmax=len(colors) - 1,
    alpha=0.3,
    s=30,
)

ax.set_title("Amazon ratings visualized in language using t-SNE")

# Get the limits for both axes
x_min, x_max = x.min(), x.max()
y_min, y_max = y.min(), y.max()

# Determine the larger range so both axes can share one square window
x_range = x_max - x_min
y_range = y_max - y_min
max_range = max(x_range, y_range)

# Centre the window on the data and pad it by 20 %
x_center = (x_max + x_min) / 2
y_center = (y_max + y_min) / 2
half_span = max_range / 2 * 1.2

ax.set_xlim(x_center - half_span, x_center + half_span)
ax.set_ylim(y_center - half_span, y_center + half_span)

# Equal aspect so one unit on x is drawn the same length as one unit on y
ax.set_aspect('equal', adjustable='box')
plt.show()


# Comparing distances between embedding vectors in the high-dimensional space with distances on the 2D map

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt

# Pairwise Euclidean distances in the original embedding space.
# pdist returns the condensed form: exactly one entry per unordered pair of rows.
original_distances = pdist(matrix, 'euclidean')

# Pairwise Euclidean distances in the reduced (t-SNE) space, in the same pair order
reduced_distances = pdist(vis_dims, 'euclidean')

# Plot the condensed vectors directly. Expanding them with squareform() and
# flattening would draw every pair twice and add the zero self-distances,
# while allocating two n x n matrices for no benefit.
plt.figure(figsize=(4, 4))
plt.scatter(original_distances, reduced_distances, alpha=0.3, s=0.00005)
plt.xlabel('Original Euclidean Distances')
plt.ylabel('Reduced Euclidean Distances')
# NOTE(review): the axis windows are hard-coded — presumably tuned to this dataset; adjust if the data changes.
plt.xlim(0.9,1.3)
plt.ylim(0,110)
plt.title('Comparison of Distances Before and After Dimensionality Reduction')
plt.grid(True)
plt.show()
