# **Importing Necessary Libraries**

In [1]:
from transformers import XLMRobertaModel, XLMRobertaTokenizer, BertTokenizer, BertModel
import torch
import re
import json
import numpy as np

In [2]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# **Loading Data**

In [17]:
file_path = '/content/drive/MyDrive/Song Recommendation System/vectors.json'

with open(file_path, 'r') as f:
        data = f.read()
        songs =  json.loads(data)

In [4]:
model_name = 'xlm-roberta-large'

tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [5]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
model.to(DEVICE)

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwi

In [7]:
bert_model.to(DEVICE)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

# **Defining Functions**

In [8]:
def format_text(text):
    text = re.sub(r'\[.*?\]', ' \n\n ', text)

    while True:
        res = re.search('[a-z][A-Z]', text)
        if res is None:
            break
        text = text[:res.start() + 1] + ' \n ' + text[res.start() + 1:]

    return text.strip()

In [9]:
def flatten_text(arr):
    return ' '.join(arr)

In [10]:
def tokenize_texts(texts, tokenizer, max_length=512):
    inputs = tokenizer(
        texts,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    inputs = {key: value.to(DEVICE) for key, value in inputs.items()}

    return inputs

In [11]:
def get_topic_vector(topic, tokenizer, model):
    inputs = tokenizer(topic, return_tensors='pt', padding=True, truncation=True)
    inputs = {key: value.to(DEVICE) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Use the mean of the token embeddings
    embeddings = outputs.last_hidden_state.mean(dim=1)

    return embeddings.squeeze().cpu().tolist()

In [12]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# **Processing Data**

In [18]:
topic_count = 0

for i in range(len(songs)):
    if 'topics' not in songs[i]:
      continue

    topics = [flatten_text(topic) for topic in songs[i]['topics']]

    vector = [get_topic_vector(topic, bert_tokenizer, bert_model) for topic in topics]
    vector = np.array(vector).mean(axis=0)

    songs[i]['topics'] = vector.tolist()

    topic_count += 1

    print(f'{i+1}. {songs[i]["title"]} - {songs[i]["artist"]}')

1. Mr. Brightside - The Killers
3. En Solskinnsdag - Postgirobygget
4. I Kissed A Girl - Katy Perry
5. Dias De Luta, Dias De Gloria - Charlie Brown Jr.
7. Berlin City Girl - Culcha Candela
8. Clouds - Paper Idol
9. Unconditionally - Katy Perry
13. Guten Tag - Kronkel Dom
16. Car Keys (Ayla) - Alok
17. Tovtatis - Eluveitie
18. Motherfucker - Dwarves
19. Gato Cerveja - kamaitachi
21. Lose You - Sam Smith
22. Klub go go - Gang Albanii
24. IN MY REMAINS - Linkin Park
25. Sk8er Boi - Avril Lavigne
26. Secrets - The Weeknd
28. Corsa Freestyle - Ryu, the Runner
30. Bassthoven - Kyle Exum
32. Hellfire - Barns Courtney
34. unravel (acoustic version) - TK from Ling tosite sigure
43. I Always Feel Like... - TRU
45. Changes - 2Pac
50. Virus - PapaBoyz
51. Saturnus - Morabeza Tobacco
53. Nun id change - Yeat
55. Repeat - Al James
56. Love, Peace & Happiness - Marteria
57. Katakan Saja - Khifnu
59. Fiu Fiu - Kizo
60. Eins Zwei Polizei - Mo-Do
62. scars - Novulent
63. Break My Heart - Dua Lipa
64. Fe

  vector = np.array(vector).mean(axis=0)
  ret = ret.dtype.type(ret / rcount)


3899. Into The Fire - Asking Alexandria
3900. EU TE AVISEI - MC Cabelinho
3909. Motivasyon - Ben Fero
3912. Bumbum de Ouro - Gloria Groove
3915. Tattoo - Loreen
3922. Dynamite - Taio Cruz
3923. When I Come Around - Green Day
3927. Duduke - Simi
3928. Latch - Disclosure
3933. Praja Magia - Nu Genea
3935. Dance Macabre - Ghost
3942. Samurai - Cairokee
3943. MMA - Azis
3944. El Patito Juan - Biper Y Sus Amigos
3946. Popular Loner - Rod Wave
3948. Buy Myself A Chance - Randy Rogers Band
3956. With All My Heart - ILLENIUM
3957. Get Money Catch Bodies - Big Moochie Grape
3962. HVA SA DU - Den BB
3963. Deep Wounds - Polo G
3965. Boom Boom Boom Boom Boom Boom Boom Boom Boom Boom Boom Boom Boom - Dan Bull
3970. J.C.V.D - Mc Kresha
3973. RUMBLE - DUSTY LOCANE
3974. Black Diamond - Stratovarius
3975. Little Wing - Jimi Hendrix
3976. Vindicated - Dashboard Confessional
3981. Like That - Fox Stevenson
3982. Nice & Good - Knucks
3985. The Lazy Song - Bruno Mars
3988. I'm Yours - Jason Mraz
3991. 3 A

In [20]:
print(f'Songs with with topics: {topic_count}')

Songs with with topics: 5294


In [21]:
count = 0

for i in range(len(songs)):
    if 'lyrics' not in songs[i]:
      continue

    lyrics = format_text(songs[i]['lyrics'])
    inputs = tokenize_texts(lyrics, tokenizer)

    with torch.no_grad():
        outputs = model(**inputs)
        vector = mean_pooling(outputs, inputs['attention_mask'])

    songs[i]['lyrics'] = vector.squeeze().cpu().tolist()

    count += 1

    print(f'{i+1}. {songs[i]["title"]} - {songs[i]["artist"]}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
692. Nogi Mammona - Ogur damex
693. Just A Cloud Away - Pharrell Williams
694. El Diablo - Elena Tsagrinou
695. Break My Heart Again - FINNEAS
702. Yesterday - Kenny G
706. Envole-moi - Jean-Jacques Goldman
707. This Cocaine Makes Me Feel Like I'm On This Song - System Of A Down
710. Good Girl - Kiyashqo
711. TQG - KAROL G
714. Urke - Wilki
715. Ruf nicht an - Trippie Boi
717. Don't Tell 'Em - Jeremih
718. BMW - Bad Boy Chiller Crew
719. Rock Me Amadeus - Falco
720. Judecata - Ciobo
721. She's So Nice - Pink Guy
727. Square Nothing - In Flames
728. One More Time - Daft Punk
730. girlfriend - hemlocke springs
734. Still Got The Blues - Gary Moore
735. The Glutton Of Sympathy - Jellyfish
743. Love In The Dark - Adele
744. Cruel - Jeff Bernat
745. From Yesterday - Thirty Seconds To Mars
749. Take Me or Leave Me - Idina Menzel
757. Alec Eiffel - Pixies
759. Chammak Challo - Akon
761. Stealing Society - System Of A Down
765. S

In [22]:
print(f'Songs with with lyrics: {count}')

Songs with with lyrics: 5294


# **Saving Data**

In [23]:
file_path = '/content/drive/MyDrive/Song Recommendation System/vector_data.json'

with open(file_path, 'w') as f:
        json.dump(songs, f, indent=4)