In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
import pandas as pd
import yaml
from sklearn.metrics.pairwise import cosine_similarity

import autoencoder

## Representation learning

Train autoencoder and obtain the embeddings.

In [7]:
autoencoder.representation_learning()

Autoencoder(
  (encoder0): Linear(in_features=1425, out_features=1024, bias=True)
  (encoder1): Linear(in_features=1024, out_features=1024, bias=True)
  (encoder2): Linear(in_features=1024, out_features=768, bias=True)
  (encoder22): Linear(in_features=768, out_features=512, bias=True)
  (encoder3): Linear(in_features=512, out_features=256, bias=True)
  (decoder0): Linear(in_features=256, out_features=512, bias=True)
  (decoder11): Linear(in_features=512, out_features=768, bias=True)
  (decoder1): Linear(in_features=768, out_features=1024, bias=True)
  (decoder2): Linear(in_features=1024, out_features=1024, bias=True)
  (decoder3): Linear(in_features=1024, out_features=1425, bias=True)
  (activation): Tanh()
)
Epoch 100/600, Loss: 0.3514698858444507
Epoch 200/600, Loss: 0.24097653593008334
Epoch 300/600, Loss: 0.20460479190716377
Epoch 400/600, Loss: 0.19185469299554825
Epoch 500/600, Loss: 0.1866001716026893
Epoch 600/600, Loss: 0.18378102492827636


To find the **5 most similar** video clips to a random one in the training set, you can follow these steps:

<ol>
<li>Select a random video clip from the training set.</li>
<li>Calculate the similarity between the embeddings of the selected video clip and all the other video clips using a similarity metric such as cosine similarity.</li>
<li>Sort the video clips based on their similarity to the selected video clip in descending order.</li>
<li>Retrieve the 5 most similar video clips from the sorted list.</li>
</ol>

In [8]:
 with open('./config.yaml', 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

EMBEDDINGS_PATH = config["EMBEDDINGS_PATH"]

embeddings = pd.read_csv(EMBEDDINGS_PATH)

In [9]:
new_row = embeddings.iloc[84, 1:]

# Calculate cosine similarity
cos_sim = cosine_similarity(embeddings.iloc[:, 1:].values, new_row.values.reshape(1, -1))

# Create a dataframe with cosine similarity values and row indices
similarity_df = pd.DataFrame({'similarity': cos_sim.flatten(), 'index': embeddings.index})

# Sort dataframe by similarity in descending order
sorted_df = similarity_df.sort_values(by='similarity', ascending=False)

# Get the top k rows
k = 5  # Replace with your desired value of k
top_k_rows = embeddings.loc[sorted_df['index'].head(k)]

top_k_rows.song_name

84                      Leftöver Crack - Rock The 40 oz.
62     Bloodhound Gang - Fire Water Burn (Official Vi...
308                              The Wild! - Livin' Free
722    Black Stone Cherry - Cheaper To Drink Alone (O...
357                    The Struts - Put Your Money On Me
Name: song_name, dtype: object