<a href="https://colab.research.google.com/github/theperiperi/Semantic-Search-Engine-for-Music/blob/main/reference_for_audio_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Building an Audio Recommendation System

In [None]:
%%bash
# Stop at the first failed install instead of silently carrying on.
set -e
pip install pandas numpy pytube
pip install -U openai-whisper
# This notebook uses the legacy pinecone.init(...) / pinecone.Index(...) API,
# which pinecone-client 3.0 removed, so stay on the 2.x line.
pip install "pinecone-client<3"

Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 798.6/798.6 kB 6.8 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 17.3 MB/s eta 0:00:00
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml): started
  Building wheel for openai-whisper (pyproject.toml): finished with status 'done'
  Created wheel for openai-whisper: filename=openai_whisper-20231117-py3-none-any.whl size=

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires openai, which is not installed.


In [None]:
# Import the modules
import os
import torch
import whisper
import pinecone
import numpy as np
import pandas as pd
from pytube import YouTube

  from tqdm.autonotebook import tqdm


In [None]:
def video_to_audio(video_url, destination):
    """Download the audio track of a YouTube video and save it under an ``.mp3`` name.

    The first audio-only stream reported by pytube is downloaded into
    ``destination``. The downloaded file is then renamed so that it carries
    an ``.mp3`` extension and has no spaces in its file name. The rename only
    changes the name; the audio data itself is not re-encoded.

    Args:
        video_url: URL of the YouTube video.
        destination: Directory the audio file is written to.

    Returns:
        Path of the renamed audio file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    # Get the video
    video = YouTube(video_url)

    # Pick the first audio-only stream; first() yields None when there is none
    audio = video.streams.filter(only_audio=True).first()
    if audio is None:
        raise ValueError(f"No audio-only stream available for {video_url}")

    # Save to destination
    output = audio.download(output_path=destination)

    # Replace spaces with "_" in the file name only, so that a destination
    # directory containing spaces still points at the real directory
    directory, file_name = os.path.split(output)
    stem, _ext = os.path.splitext(file_name)
    new_file = os.path.join(directory, stem.replace(" ", "_") + '.mp3')

    # os.replace overwrites a file left over from an earlier run on every
    # platform, which keeps the download cell re-runnable
    os.replace(output, new_file)

    return new_file

In [None]:
%%bash
# -p makes the cell re-runnable: no error if the directory already exists
mkdir -p "audio_data"

In [None]:
# Directory the downloaded audio files are written to
audio_path = "audio_data"

# Replace the `...` placeholder with comma-separated YouTube links (as strings)
list_videos = [...]

# One row per link, stored in the "URLs" column
transcription_df = pd.DataFrame(data=list_videos, columns=["URLs"])

In [None]:
# Preview the first five rows (head() defaults to 5) to check the "URLs" column
transcription_df.head()

Unnamed: 0,URLs
0,https://www.youtube.com/watch?v=IdTMDpizis8
1,https://www.youtube.com/watch?v=fLeJJPxua3E
2,https://www.youtube.com/watch?v=z3FA2kALScU
3,https://www.youtube.com/watch?v=yBrRpb8aLwk
4,https://www.youtube.com/watch?v=ERClHCOF14c


In [None]:
# Download the audio for every URL and record where each file was saved
transcription_df["file_name"] = transcription_df["URLs"].apply(
    lambda video_url: video_to_audio(video_url, audio_path)
)
transcription_df.head()

## Transcription

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "large" Whisper model onto the selected device
whisper_model = whisper.load_model("large", device=device)

100%|██████████████████████████████████████| 2.88G/2.88G [00:26<00:00, 117MiB/s]


In [None]:
def audio_to_text(audio_file):
    """Transcribe one audio file with the loaded Whisper model.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The value stored under the ``"text"`` key of the transcription result.
    """
    transcription_result = whisper_model.transcribe(audio_file)
    return transcription_result["text"]

In [None]:
# Transcribe every downloaded audio file
transcription_df["transcriptions"] = transcription_df["file_name"].apply(audio_to_text)

# Preview the first five rows
transcription_df.head()

In [None]:
import textwrap

In [None]:
# Wrap the first transcription at 60 characters per line so it is easy to read
first_transcription = transcription_df["transcriptions"].iloc[0]
wrapper = textwrap.TextWrapper(width=60)
formatted_transcription = wrapper.fill(first_transcription)

# Check first transcription
print(formatted_transcription)

In [None]:
# No cell shown above creates "transcription_length", so drop it only if it
# happens to exist; errors="ignore" keeps a fresh Run All from raising KeyError
transcription_df = transcription_df.drop(columns=["transcription_length"], errors="ignore")

In [None]:
# Preview the frame again after the column drop above
transcription_df.head()

## Generation of Transcripts' Embeddings

In [None]:
!pip install openai
import openai

In [None]:
# Set up the OpenAI key.
# Read it from the OPENAI_API_KEY environment variable so the secret never has
# to be saved in the notebook; falls back to "" (the original placeholder).
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
def get_embeddings(text_to_embed):
    """Embed a piece of text with OpenAI's ``text-embedding-ada-002`` model.

    Args:
        text_to_embed: The text to embed.

    Returns:
        The embedding found at ``response["data"][0]["embedding"]``.
    """
    response = openai.Embedding.create(
        input=[text_to_embed],
        model="text-embedding-ada-002",
    )

    # A single input was sent, so the embedding is in the first "data" entry
    first_result = response["data"][0]
    return first_result["embedding"]

In [None]:
# Embed every transcription (cast to str first) and keep the vectors in a new column
transcription_df["embedding"] = (
    transcription_df["transcriptions"]
    .astype(str)
    .apply(get_embeddings)
)

In [None]:
# Preview the frame, which now carries the "embedding" column
transcription_df.head()

In [None]:
# Dimensionality of the embeddings: the length of the first row's vector.
# vector_dim holds the length itself (not the vector), matching how the
# index-setup cell below uses the same name.
vector_dim = len(transcription_df.iloc[0].embedding)
vector_dim

## Configure your environment

In [None]:
# find API key in console at app.pinecone.io
# It is read from the PINECONE_API_KEY environment variable so the secret never
# has to be saved in the notebook; falls back to "" (the original placeholder).
api_key = os.environ.get("PINECONE_API_KEY", "")
# find ENV (cloud region) next to API key in console
# PINECONE_ENVIRONMENT overrides the default "gcp-starter" when it is set.
env = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")

# Initialize connection to pinecone
pinecone.init(
  api_key=api_key,
  environment=env
)

# Index params: the index dimension must equal the embedding length
my_index_name = "audio-search"
vector_dim = len(transcription_df.iloc[0].embedding)

# Only create the index when it does not exist yet, so the cell can be re-run
if my_index_name not in pinecone.list_indexes():
  # Create the index
  pinecone.create_index(name = my_index_name,
                      dimension=vector_dim,
                      metric="cosine", shards=1,
                      pod_type='s1.x1')
# Connect to the index
my_index = pinecone.Index(index_name = my_index_name)

In [None]:
# Show information about the vector index; run before the upsert cell, this
# gives a baseline to compare with the stats shown after the data is uploaded
my_index.describe_index_stats()

## Populate the Pinecone Vector Index

In [None]:
# Use the stringified DataFrame index as the vector ID for each row
transcription_df["vector_id"] = [str(label) for label in transcription_df.index]

# Get all the metadata: one dict per row, built by walking the two columns
# together instead of re-slicing the frame with .iloc for every field
final_metadata = [
    {'ID': position, 'url': url, 'transcription': transcription}
    for position, (url, transcription) in enumerate(
        zip(transcription_df["URLs"], transcription_df["transcriptions"])
    )
]

audio_IDs = transcription_df["vector_id"].tolist()
audio_embeddings = transcription_df["embedding"].tolist()

# Create the single list of (id, embedding, metadata) tuples to insert
data_to_upsert = list(zip(audio_IDs, audio_embeddings, final_metadata))

# Upload the final data in small batches so that a long list of videos does
# not produce one oversized upsert request
UPSERT_BATCH_SIZE = 100
for batch_start in range(0, len(data_to_upsert), UPSERT_BATCH_SIZE):
    batch = data_to_upsert[batch_start:batch_start + UPSERT_BATCH_SIZE]
    my_index.upsert(vectors=batch)

# Show information about the vector index
my_index.describe_index_stats()

In [None]:
# Query with the first video's own embedding and ask for the top N matches
N = 3
my_query_embedding = transcription_df["embedding"][0]

# Run the Query Search
my_index.query(vector=my_query_embedding, top_k=N, include_metadata=True)