# **Importing Necessary Libraries**

In [1]:
import os
from utils.utils import write_json, load_json
from concurrent.futures import ThreadPoolExecutor, as_completed
import asyncio
import nest_asyncio
import time
import random

In [2]:
from scripts.data_extraction import SpotifyAPI, LastFMAPI
from scripts.scraper import Scraper
from scripts.feature_extraction import LanguageDetector, ZeroShotClassifier
from scripts.audio_extraction import Audio, download_audio
from scripts.text_extraction import extract_topics, compute_tfidf

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Patch asyncio so that asyncio.run() (used by the processing loop further
# down) can be called even though the notebook kernel already has an event
# loop running.
nest_asyncio.apply()

In [4]:
# API credentials are read from the environment rather than hard-coded.
# os.environ[...] raises KeyError when a variable is not set, so a missing
# credential fails here instead of at the first API call.
LASTFM_API_KEY = os.environ["LASTFM_API_KEY"]
SPOTIFY_CLIENT_ID = os.environ["SPOTIFY_CLIENT_ID"]
SPOTIFY_CLIENT_SECRET = os.environ["SPOTIFY_CLIENT_SECRET"]

# **Creating Instances**

In [5]:
# Load the sample song list.
# NOTE(review): the following cell rebinds `songs_data` to
# data/filtered_songdata.json, so on a top-to-bottom run this load is
# discarded — presumably only one of the two cells is meant to be executed.
songs_data = load_json("data/sample.json")

In [6]:
# Load the filtered song list; this is the `songs_data` the processing loop
# iterates over when the notebook is run top to bottom.
songs_data = load_json("data/filtered_songdata.json")

In [7]:
# URL prefixes used by scrape_lyrics to build lyrics-page addresses.
base_url = "https://www.azlyrics.com/lyrics/"
base_url_genius = "https://genius.com/"

# Candidate labels handed to the zero-shot classifier in get_sentiment.
sentiments = [
    'amusement',
    'anxiety',
    'angry',
    'dreaminess',
    'eroticism',
    'feeling pumped up',
    'joy',
    'relaxation',
    'romance',
    'sadness',
]

In [37]:
# Build the API clients from the credentials read earlier. These module-level
# objects are used by fetch_data_from_apis and audio_analysis.
lastfm = LastFMAPI(LASTFM_API_KEY)
spotify = SpotifyAPI(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)
# Request an access token up front; as the last expression of the cell its
# return value is displayed as the cell output.
spotify.get_access_token()

{'msg': 'Access Token Generated', 'validity': 3600}

In [9]:
# Shared model instances: `language_detector` is used by get_language and
# `classifier` by get_sentiment.
language_detector = LanguageDetector()
classifier = ZeroShotClassifier()

In [38]:
# Shared scraper instance used by scrape_lyrics and scrape_youtube_link.
# The processing loop closes and recreates it periodically.
scraper = Scraper(headless=False)


Using proxy: http://104.207.35.225:3128



# **Defining Functions**

In [11]:
def fetch_data_from_apis(id, title, artist, artist_id):
    """Collect track, artist and Last.fm metadata for one song.

    Uses the notebook-level ``spotify`` and ``lastfm`` clients and calls, in
    this order, ``spotify.fetch_song(id)``, ``spotify.fetch_artist(artist_id)``
    and ``lastfm.fetch_song(title, artist)``, then prints a progress line.

    Args:
        id: Track identifier passed to ``spotify.fetch_song``.
        title: Song title passed to the Last.fm lookup.
        artist: Artist name passed to the Last.fm lookup.
        artist_id: Artist identifier passed to ``spotify.fetch_artist``.

    Returns:
        A 3-tuple ``(spotify_data, artist_data, lastfm_data)`` containing the
        three responses unchanged.
    """
    # Tuple elements are evaluated left to right, so the request order is
    # track -> artist -> Last.fm.
    responses = (
        spotify.fetch_song(id),
        spotify.fetch_artist(artist_id),
        lastfm.fetch_song(title, artist),
    )

    print("\tInformation fetched from APIs....")

    return responses

In [12]:
def audio_analysis(id):
    """Fetch both Spotify audio payloads for one track.

    Calls ``spotify.fetch_audio_features(id)`` followed by
    ``spotify.fetch_audio_analysis(id)`` on the notebook-level client and
    prints a progress line.

    Args:
        id: Track identifier forwarded to both Spotify calls.

    Returns:
        A 2-tuple ``(audio_features, audio_analysis)`` with the two responses
        unchanged.
    """
    features = spotify.fetch_audio_features(id)
    analysis = spotify.fetch_audio_analysis(id)

    print("\tAudio Analysis done....")

    return features, analysis

In [13]:
def scrape_lyrics(title, artist):
    """Scrape lyrics for a song, trying AZLyrics first and Genius second.

    Both names are reduced to letters, digits and whitespace before the URLs
    are built. The AZLyrics URL is ``base_url`` + lower-cased artist and title
    with all whitespace removed; the Genius URL is ``base_url_genius`` + the
    remaining words joined by single hyphens + ``-lyrics``.

    Args:
        title: Song title as stored in the song data.
        artist: Name of the song's artist.

    Returns:
        ``(lyrics, in_movie, movie_name)`` where ``lyrics`` has newlines
        replaced by spaces, or ``(None, None, None)`` when neither site
        yielded lyrics. ``in_movie`` and ``movie_name`` always come from the
        AZLyrics page source, even when the lyrics came from Genius.
    """
    # Drop punctuation; hyphens are removed here as well.
    artist = ''.join(letter for letter in artist if (letter.isalnum() or letter.isspace()))
    title = ''.join(letter for letter in title if (letter.isalnum() or letter.isspace()))

    # Removing punctuation can leave doubled or leading/trailing whitespace
    # (e.g. "A + B" -> "A  B"). Splitting into words collapses those runs so
    # the Genius URL does not end up with "--" or a leading "-".
    artist_words = artist.split()
    title_words = title.split()

    url = base_url + ''.join(artist_words).lower() + "/" + ''.join(title_words).lower() + ".html"
    source = scraper.get_source(url)

    lyrics = scraper.get_lyrics(source)
    in_movie, movie_name = scraper.get_movie_name(source)

    if lyrics is None:
        # Fall back to Genius when AZLyrics has no lyrics for this song.
        # NOTE(review): assumes Genius slugs use single hyphens between words.
        url = base_url_genius + '-'.join(artist_words + title_words) + "-lyrics"

        source = scraper.get_source(url)
        lyrics = scraper.get_genius_lyrics(source)

        if lyrics is None:
            print("\tLyrics not found....")
            return None, None, None

    lyrics = lyrics.replace('\n', ' ')
    print("\tLyrics scraped....")

    return lyrics, in_movie, movie_name

In [14]:
def scrape_youtube_link(url):
    """Extract a YouTube link from the page at ``url``.

    Loads the page through the shared ``scraper`` and passes its source to
    ``scraper.get_youtube_link``.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever ``scraper.get_youtube_link`` returns for that page source.
    """
    youtube_link = scraper.get_youtube_link(scraper.get_source(url))

    print("\tYoutube link scraped....")

    return youtube_link

In [15]:
def get_language(text):
    """Detect the language of ``text`` with the shared ``language_detector``.

    Args:
        text: Text to analyse (the scraped lyrics in this notebook).

    Returns:
        The value returned by ``language_detector.detect_language(text)``.
    """
    detected = language_detector.detect_language(text)

    print("\tLanguage detected....")

    return detected

def get_sentiment(text, index):
    """Classify ``text`` against the ``sentiments`` labels.

    The ``index`` argument is passed straight through so that callers running
    many classifications concurrently can match each result to its song.

    Args:
        text: Text to classify.
        index: Caller-supplied position, returned unchanged.

    Returns:
        A 2-tuple ``(sentiment, index)`` where ``sentiment`` is the result of
        ``classifier.classify`` with ``multi_label=True`` and
        ``return_logits=True``.
    """
    return classifier.classify(text, sentiments, multi_label=True, return_logits=True), index

In [16]:
def extract_text_features(lyrics):
    """Compute topic and TF-IDF features for a single lyrics string.

    The lyrics are wrapped in a one-element list for both helpers.

    Args:
        lyrics: Lyrics text of one song.

    Returns:
        A 2-tuple ``(topics, tfidf)``: the result of ``extract_topics`` with
        ``num_topics=3`` and ``num_words=3``, and the second item returned by
        ``compute_tfidf`` converted with ``.tolist()``.
    """
    topics = extract_topics([lyrics], num_topics=3, num_words=3)
    # Only the second element of the pair is kept.
    _unused, tfidf_values = compute_tfidf([lyrics])

    print("\tText features extracted....")

    return topics, tfidf_values.tolist()

In [17]:
def extract_audio_features(url):
    """Download a track's audio and return its flattened mel spectrogram.

    Args:
        url: Location handed to ``download_audio``.

    Returns:
        The mel spectrogram from ``Audio.get_melspectogram`` (called with
        ``reduce_dimensions=False``), reshaped to one dimension and converted
        to a list, or ``None`` when ``download_audio`` returned ``None``.
    """
    audio_path = download_audio(url)

    if audio_path is None:
        # Return a single None so both outcomes have the same shape; the
        # previous ``(None, None)`` tuple never compared equal to None.
        print("\tAudio download failed....")
        return None

    print("\tAudio downloaded....")

    audio = Audio(audio_path)
    melspectrogram = audio.get_melspectogram(reduce_dimensions=False).reshape(-1).tolist()

    return melspectrogram

# **Creating the Pipeline**

In [18]:
async def process_song(song, index=0):
    """Gather API, lyrics, text and audio-analysis data for one song.

    The blocking helpers are run on a thread pool through the running event
    loop so that the API requests, lyrics scraping and audio analysis overlap.

    Args:
        song: Dict with ``'SongID'`` and ``'SongData'``; ``'SongData'`` must
            provide ``'title'``, ``'genre'`` and a non-empty ``'artists'``
            list whose first entry has ``'name'`` and ``'id'``.
        index: Number printed in front of the progress line.

    Returns:
        A dict merging the lyrics-derived fields (``in_movie``, ``lyrics``,
        ``language``, ``topics``, ``tfidf`` — present only when lyrics were
        found), ``title``, ``artist``, every key of the Spotify track,
        Last.fm, Spotify artist, audio-features and audio-analysis results,
        and ``genre``. A ``'song_url'`` key is removed if present.
    """
    song_id = song['SongID']
    song_meta = song['SongData']
    title = song_meta['title']
    # Only the first listed artist is used.
    primary_artist = song_meta['artists'][0]
    artist = primary_artist['name']
    artist_id = primary_artist['id']
    data = {}

    print(f"{index}. {title} - {artist}")

    with ThreadPoolExecutor() as executor:
        # Inside a coroutine the running loop is the one to schedule on.
        loop = asyncio.get_running_loop()

        #! Fetch API data and scrape lyrics in parallel
        api_task = loop.run_in_executor(executor, fetch_data_from_apis, song_id, title, artist, artist_id)
        lyrics_task = loop.run_in_executor(executor, scrape_lyrics, title, artist)

        spotify_data, artist_data, lastfm_data = await api_task

        #! do audio analysis
        audio_analysis_task = loop.run_in_executor(executor, audio_analysis, song_id)

        #! Get language and textual features
        # The movie name is scraped but not stored in the result.
        lyrics, in_movie, _movie_name = await lyrics_task

        if lyrics is not None:
            text_task = loop.run_in_executor(executor, extract_text_features, lyrics)
            language_task = loop.run_in_executor(executor, get_language, lyrics)

            language = await language_task
            topics, tfidf = await text_task

            data.update({
                'in_movie': in_movie,
                'lyrics': lyrics,
                'language': language,
                'topics': topics,
                'tfidf': tfidf,
            })

        audio_features, analytic_features = await audio_analysis_task

        #! Compiling all the data
        # Later mappings win on key collisions: track < Last.fm < artist <
        # audio features < audio analysis.
        data.update({
            "title": title,
            "artist": artist,
            **spotify_data,
            **lastfm_data,
            **artist_data,
            **audio_features,
            **analytic_features
        })

        data['genre'] = song_meta['genre']

        data.pop('song_url', None)

        print("\tCompiled data....\n\n")

        return data

In [19]:
def analyse_mood(songs, max_workers=3):
    """Attach a ``'sentiment'`` entry to every song that has lyrics.

    Each song with lyrics is classified by ``get_sentiment`` on a thread
    pool. When a classification finishes, the song gets a ``'sentiment'`` key
    and its ``'lyrics'`` key is deleted. Songs without lyrics are left
    untouched instead of aborting the whole run with a ``KeyError``.

    Args:
        songs: List of song dicts; modified in place. Classified songs must
            provide ``'title'`` and ``'artist'`` for the progress line.
        max_workers: Size of the thread pool.

    Returns:
        The same ``songs`` list that was passed in.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Songs for which no lyrics were found carry no 'lyrics' key, so they
        # are skipped rather than submitted.
        futures = [
            executor.submit(get_sentiment, song['lyrics'], idx)
            for idx, song in enumerate(songs)
            if song.get('lyrics') is not None
        ]

        print("Registered all the tasks....\n")

        for future in as_completed(futures):
            try:
                sentiment, index = future.result()
                songs[index]['sentiment'] = sentiment

                # Lyrics are only needed for classification; drop them so
                # they are not persisted with the results.
                del songs[index]['lyrics']

                print(f"{index}. {songs[index]['title']} - {songs[index]['artist']}")

            except Exception as e:
                # One failed classification must not stop the others.
                print(f"An error occurred: {e}")

    return songs


In [39]:
# Position in songs_data (0-based) at which this run starts.
# NOTE(review): hard-coded resume offset — update it before re-running.
START_INDEX = 14993
# Flush the buffers to disk and restart the browser after this many songs.
CHECKPOINT_EVERY = 100
# Request a new Spotify token after this many songs.
# NOTE(review): presumably chosen to stay inside the token's validity window.
TOKEN_REFRESH_EVERY = 450


def _flush_checkpoint(buffered_songs, buffered_genres):
    """Append the buffered songs and genres to the on-disk JSON checkpoints."""
    prev_data = load_json("data/song_data.json")
    prev_genres = load_json("data/genres.json")
    prev_data.extend(buffered_songs)
    prev_genres.extend(buffered_genres)

    write_json("data/song_data.json", prev_data)
    write_json("data/genres.json", prev_genres)


songs = []
genres = []

try:
    for i in range(START_INDEX, len(songs_data)):
        try:
            songdata = asyncio.run(process_song(songs_data[i], i+1))

            songs.append(songdata)
            genres.append(songdata['genre'])

            if (i+1) % CHECKPOINT_EVERY == 0:
                _flush_checkpoint(songs, genres)

                songs.clear()
                genres.clear()

                # Start the next batch with a fresh browser session.
                scraper.close()
                scraper = Scraper(headless=False)

            if (i+1) % TOKEN_REFRESH_EVERY == 0:
                spotify.get_access_token()
                print("\nAccess token refreshed....\n")

        except Exception as e:
            # Save what has been collected so far, report the failure and
            # stop. The buffers are left as they are.
            _flush_checkpoint(songs, genres)

            print(e)
            print(f"\nError occurred at index {i}....")
            break

        # Random pause between songs.
        time.sleep(random.randint(2, 5))
finally:
    # Close the browser even when the run is interrupted (for example by a
    # KeyboardInterrupt, which `except Exception` does not catch).
    scraper.close()

14846. Boy Moaning and Whimpering + Wet Noises - paincakes
	Information fetched from APIs....
	Audio Analysis done....
	Lyrics not found....
	Compiled data....


14847. She Wolf - Shakira
	Information fetched from APIs....
	Lyrics scraped....
	Text features extracted....
	Audio Analysis done....
	Language detected....
	Compiled data....


14848. Walk Em Down (feat. Roddy Ricch) - NLE Choppa
	Information fetched from APIs....
	Audio Analysis done....
	Lyrics not found....
	Compiled data....


14849. Nûsfur - Sözer Sepetçi
	Information fetched from APIs....
	Audio Analysis done....
	Lyrics not found....
	Compiled data....


14850. What's up? Pop! (feat. 初音ミク) - Capchii
	Information fetched from APIs....
	Audio Analysis done....
	Lyrics not found....
	Compiled data....



Access token refreshed....

14851. Parallel Universe - Clara Benin
	Information fetched from APIs....
	Lyrics scraped....
	Text features extracted....
	Audio Analysis done....
	Language detected....
	Compiled data....




In [None]:
# Classify the mood of the songs still held in the in-memory `songs` buffer.
# NOTE(review): this cell is not idempotent — analyse_mood deletes each
# classified song's 'lyrics', so running it a second time on the same list
# will not find lyrics to classify.
songs = analyse_mood(songs)

# **Updating the JSON File**

In [None]:
# Persist the in-memory `songs` list.
# NOTE(review): the processing loop checkpoints to data/song_data.json, while
# this writes to data/data.json — confirm that two separate files are intended.
write_json("data/data.json", songs)

In [None]:
# Persist the in-memory `genres` list.
# NOTE(review): the processing loop extends data/genres.json with every
# checkpoint; if write_json replaces the file (presumably it does), this call
# leaves only the genres still buffered in memory — confirm this is intended
# before running.
write_json("data/genres.json", genres)