In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import requests
from PIL import Image
import os

## This notebook creates spectrograms from audio previews using the Librosa library.

This code is an integral part of a data processing workflow aimed at downloading audio files and generating their corresponding spectrograms. It begins by ensuring that two directories, **./Audio_Files** and **./Spectograms_images**, are present to store the downloaded audio files and their spectrograms. The process then involves reading a CSV file containing URLs for audio previews, which are the source for the audio samples.

A notable feature of the code is its ability to resume processing from the last completed task if interrupted, using a **last_processed_id** as a reference. This is followed by defining a function, **extract_sample_id**, which extracts the unique sample ID from the URL, crucial for subsequent steps.

The core of the script lies in the **process_sample** function. This function handles several tasks: constructing the URL for the audio download, saving the audio file, and then using librosa, an audio analysis library, to load the audio and transform it into a spectrogram. The spectrogram, a visual representation of the audio's frequency spectrum, is then resized to 224×224 pixels, min–max normalized to the range [0, 1], and saved as a grayscale PNG image.

The script processes each sample in the CSV file in order, starting from the row immediately following the one that contains **last_processed_id**. For each sample, the script extracts the ID, downloads and processes the audio, and generates a spectrogram. Errors raised while processing a sample are caught and printed, so a single failure does not stop the run, and a message is printed for every spectrogram that is saved. Once all samples have been processed, the script prints a completion message.

In [None]:
# Create the output folders up front; exist_ok makes this a no-op on re-runs
for _folder in ('./Audio_Files', './Spectograms_images'):
    os.makedirs(_folder, exist_ok=True)

# Load the table of preview URLs
# NOTE(review): this is an absolute, machine-specific path - it will need
# adjusting when the notebook is run from another checkout or machine.
_preview_csv = '/Users/sasha.cures/Documents/GitHub/group-coursework-machine-learners/data/preview_urls/preview_urls_0.csv'
df = pd.read_csv(_preview_csv)

# Resume marker: ID of the last sample handled before an interrupted run
last_processed_id = 'd9ffc253bee00cd49f21e28dc898363866c8c327'

# Function to extract sample ID from URL
def extract_sample_id(url):
    """Return the last path segment of *url* with any query string removed.

    The text after the final '/' is taken, and everything from the first
    '?' onwards is discarded.
    """
    last_segment = url.split('/')[-1]
    sample_id, _, _ = last_segment.partition('?')
    return sample_id

# Function to process each sample
def process_sample(sample_id, timeout=30):
    """Download one audio preview and save its spectrogram as a PNG.

    The preview is fetched from the p.scdn.co preview endpoint, written to
    ./Audio_Files/<sample_id>.LOFI.mp3, converted to a dB-scaled STFT
    magnitude spectrogram, resized to 224x224, min-max normalised and saved
    to ./Spectograms_images/<sample_id>_spectrogram.png in grayscale.

    Args:
        sample_id: Identifier of the preview clip (used in the URL and in
            both output file names).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Progress and failures are reported with print(); any exception
        raised while processing is caught so a batch run can continue.
    """
    try:
        audio_url = f'https://p.scdn.co/mp3-preview/{sample_id}?cid=a7cee8cf12b141928cb92bf2570b079c'
        # Without a timeout a single stalled connection would block the
        # whole batch indefinitely.
        response = requests.get(audio_url, timeout=timeout)

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to download {sample_id}")
            return

        # Download and save the audio file
        audio_file_path = f'./Audio_Files/{sample_id}.LOFI.mp3'
        with open(audio_file_path, 'wb') as file:
            file.write(response.content)

        # Load audio file with Librosa (sr=None keeps the file's own rate)
        y, _ = librosa.load(audio_file_path, sr=None)

        # Generate a spectrogram: STFT magnitude in dB relative to its peak
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)

        # Resize and normalize the spectrogram
        target_size = (224, 224)
        D_resized = np.asarray(Image.fromarray(D).resize(target_size, Image.BICUBIC))
        low = D_resized.min()
        value_range = D_resized.max() - low
        if np.isfinite(value_range) and value_range > 0:
            D_resized = (D_resized - low) / value_range
        else:
            # A constant (e.g. silent) spectrogram has zero range; dividing
            # would fill the image with NaNs, so emit a flat image instead.
            D_resized = np.zeros_like(D_resized)

        # Save the resized spectrogram as an image
        image_path = f'./Spectograms_images/{sample_id}_spectrogram.png'
        plt.imsave(image_path, D_resized, cmap='gray', format='png')

        print(f"Spectrogram saved for {sample_id}")

    except Exception as e:
        # Deliberately broad: one bad sample must not abort the batch run.
        print(f"Error processing {sample_id}: {e}")

# Find the position of the last processed sample.
# - regex=False matches the ID literally rather than as a regular expression.
# - na=False treats missing URLs as "no match" so the mask stays boolean.
# - np.flatnonzero yields a *position*, which is what .iloc expects, rather
#   than an index label.
preview_urls = df['Preview_URL']
if last_processed_id:
    match_positions = np.flatnonzero(
        preview_urls.str.contains(last_processed_id, regex=False, na=False).to_numpy()
    )
    if match_positions.size == 0:
        raise ValueError(
            f"last_processed_id {last_processed_id!r} was not found in the "
            "'Preview_URL' column; clear it to start from the first row."
        )
    last_index = int(match_positions[0])
else:
    # No resume marker set: start from the very first row.
    last_index = -1

# Start processing from the next sample
for preview_url in preview_urls.iloc[last_index + 1:]:
    # Rows without a URL (NaN/empty) cannot yield a sample ID; skip them
    # instead of letting the whole run crash.
    if not isinstance(preview_url, str) or not preview_url:
        print(f"Skipping row with missing Preview_URL: {preview_url!r}")
        continue
    sample_id = extract_sample_id(preview_url)
    process_sample(sample_id)

print("Processing completed.")