# **Importing Necessary Libraries**

In [1]:
import numpy as np
from itertools import combinations
import matplotlib.pyplot as plt
from mittens import GloVe
from sklearn.preprocessing import MinMaxScaler
import re

In [2]:
from utils.utils import load_json, write_json
from utils.similarity_score import cosine

In [3]:
# Per-song metadata records. Later cells read keys such as 'artist_followers',
# 'artist_popularity', 'genre' and 'artist_genre' from every entry.
songs = load_json('data/song_data.json')

In [4]:
# Collection of genre groups: each entry is iterated as a group of genre labels,
# and every pair of labels inside a group counts as one co-occurrence.
genres = load_json('data/genres.json')

# **Global Variables**

In [5]:
# Supported language codes. A code's position in this list is the index of its
# flag in each song's `language` vector, so the order must stay fixed.
languages = ['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh']

In [6]:
# Dimensionality of each GloVe genre embedding; also the length of the all-zero
# fallback vector returned when none of a song's genres is recognised.
genre_vector_size = 20

# **Create Embeddings**

1. album type - `ohe`
2. explicit - `ohe`
3. genre - `GloVe`
4. artist name, followers, genres, popularity - `GloVe` (genres) and `MinMaxScaler` (followers, popularity)
5. in movie - `ohe`
6. language - `ohe`
7. topics - `model`
8. lyrics - `model`

### **GloVe Algorithm**

In [7]:
# Gather every distinct genre label that appears in any genre group.
unique_genres = set()

for genre_group in genres:
    labels = set(genre_group)
    # Only touch the accumulator when the group contributes something new.
    if not labels <= unique_genres:
        unique_genres |= labels

In [8]:
# Assign row/column indices in sorted label order. Iterating the set directly
# would hand out indices in hash order, which for strings changes between
# interpreter sessions, so the same genre could land on a different row of the
# co-occurrence matrix (and of the embedding table) after a kernel restart.
# `key=str` keeps the ordering well defined even if a non-string label is present.
genre_to_index = {genre: i for i, genre in enumerate(sorted(unique_genres, key=str))}
# Inverse lookup: matrix/embedding row index -> genre label.
index_to_genre = {i: genre for genre, i in genre_to_index.items()}

In [9]:
# Symmetric genre-by-genre co-occurrence counts: every unordered pair of labels
# that shares a genre group adds one to both mirrored cells.
n_genres = len(unique_genres)
matrix = np.zeros((n_genres, n_genres), dtype=int)

for genre_group in genres:
    for first, second in combinations(genre_group, 2):
        row, col = genre_to_index[first], genre_to_index[second]
        matrix[row, col] += 1
        matrix[col, row] += 1

In [12]:
# Train GloVe on the genre co-occurrence counts. `data` holds one embedding row
# per genre, addressed by the row indices stored in `genre_to_index`.
# NOTE(review): no random seed is fixed here, so the learned vectors presumably
# differ between runs — confirm whether reproducible output is required.
model = GloVe(n=genre_vector_size, learning_rate=0.1, max_iter=1000)
data = model.fit(matrix)

Iteration 1000: error 3.8948

In [11]:
# Sanity check: expected shape is (number of unique genres, genre_vector_size).
data.shape

(2650, 20)

In [13]:
def genre_embeddings(genres):
    """Average the GloVe vectors of the given genre labels.

    Args:
        genres: Iterable of genre labels, or None (treated as empty). Labels
            that are not keys of `genre_to_index` are ignored.

    Returns:
        list[float]: Element-wise mean of the matching rows of `data`, or an
        all-zero vector of length `genre_vector_size` when nothing matches.
    """
    if genres is None:
        genres = ()

    # Explicit membership test instead of a bare `except:`: only genuinely
    # unknown labels are skipped, while unrelated failures (bad indices,
    # interrupts, ...) are no longer silently swallowed.
    vectors = [
        data[genre_to_index[genre]]
        for genre in genres
        if genre in genre_to_index
    ]

    if not vectors:
        return np.zeros(genre_vector_size).tolist()

    return np.mean(vectors, axis=0).tolist()

### **Scaler**

In [14]:
# One scaler per feature so follower counts and popularity scores are each
# min-max rescaled over their own observed range.
follower_scaler = MinMaxScaler()
popularity_scaler = MinMaxScaler()

In [15]:
# Collect the raw artist statistics in song order; position i belongs to songs[i].
followers = []
for song in songs:
    followers.append(song['artist_followers'])

popularity = []
for song in songs:
    popularity.append(song['artist_popularity'])

In [16]:
# The scaler is fitted on a single-column 2-D array, so add a trailing axis to
# the flat list of follower counts first.
followers_column = np.asarray(followers)[:, np.newaxis]
scaled_followers = follower_scaler.fit_transform(followers_column)

In [17]:
# Same single-column reshaping as for the follower counts, applied to popularity.
popularity_column = np.asarray(popularity)[:, np.newaxis]
scaled_popularity = popularity_scaler.fit_transform(popularity_column)

### **One Hot Encoding**

In [18]:
# Encode every song in place. `i` also indexes `scaled_popularity` and
# `scaled_followers`, which were built from `songs` in this same order.
for i, song in enumerate(songs):
    # Idempotency guard: a song that already carries 'artist_vector' has been
    # encoded by an earlier run of this cell. Encoding it again would corrupt
    # the already-numeric fields (e.g. every album_type would become 1) and then
    # fail on the raw artist keys that were deleted the first time round.
    if 'artist_vector' in song:
        continue

    # Stage all new values first and apply them together at the end, so an
    # error part-way through leaves the song untouched rather than half-encoded.
    encoded = {}

    if 'album_type' in song:
        encoded['album_type'] = 0 if song['album_type'] == 'single' else 1

    # `== False` is kept on purpose: only False/0 map to 0, any other value
    # (including None) maps to 1, exactly as before.
    if 'explicit' in song:
        encoded['explicit'] = 0 if song['explicit'] == False else 1

    # A missing 'in_movie' key is treated like False.
    encoded['in_movie'] = 0 if song.get('in_movie', False) == False else 1

    if 'language' in song:
        # Multi-hot vector over `languages`; an unsupported code raises
        # ValueError from `list.index`, as it did before.
        language_flags = np.zeros(len(languages), dtype=int)
        for lang in song['language']:
            language_flags[languages.index(lang)] = 1
        encoded['language'] = language_flags.tolist()

    encoded['genre'] = genre_embeddings(song['genre'])

    # Artist vector = mean genre embedding + scaled popularity + scaled followers.
    # float() makes the appended values plain Python floats for serialisation.
    artist_vector = genre_embeddings(song['artist_genre'])
    artist_vector.extend([float(scaled_popularity[i][0]), float(scaled_followers[i][0])])
    encoded['artist_vector'] = artist_vector

    song.update(encoded)

    # The raw artist fields are now folded into 'artist_vector'.
    del song['artist_genre'], song['artist_popularity'], song['artist_followers']
    song.pop('tfidf', None)

    print(f'{i + 1}. {song["title"]} - {song["artist"]}')

    

1. Mr. Brightside - The Killers
2. Achei Que Fosse Fácil - Bala na Agulha
3. En Solskinnsdag - Postgirobygget
4. I Kissed A Girl - Katy Perry
5. Dias De Luta, Dias De Gloria - Charlie Brown Jr.
6. Agora Estou Sofrendo - Ao Vivo - Calcinha Preta
7. Berlin City Girl - Culcha Candela
8. Clouds - Paper Idol
9. Unconditionally - Katy Perry
10. Teeth Grinding [Mixed] - Noize Suppressor
11. Kitne Bhi Tu Karle Sitam - Sanam Teri Kasam / Soundtrack Version - Kishore Kumar
12. One Right Now (with The Weeknd) - Post Malone
13. Guten Tag - Kronkel Dom
14. Contemporary Ambiance for Dogs and Calming - Dog Music
15. глобальный вайп - mzlff
16. Car Keys (Ayla) - Alok
17. Tovtatis - Eluveitie
18. Motherfucker - Dwarves
19. Gato Cerveja - kamaitachi
20. Relaxing Songs for Keen Doggie - Pet Music by Lullify
21. Lose You - Sam Smith
22. Klub go go - Gang Albanii
23. Euphoria - Luca
24. IN MY REMAINS - Linkin Park
25. Sk8er Boi - Avril Lavigne
26. Secrets - The Weeknd
27. Still (I Got Summer On My Mind) - 

# **Save the Embeddings**

In [19]:
# Persist the encoded song records (the in-place modified `songs` list).
write_json('data/vectors.json', songs)