In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, Dropout, LSTM
from sklearn.model_selection import train_test_split
import librosa
import numpy as np
import os

# Function to load and preprocess images
def load_and_preprocess_image(image_path):
    img = image.load_img(image_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    return img_array/255.0

def load_and_preprocess_audio(audio_path, max_audio_length):
    audio_data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)

    # Ensure the audio has the desired length
    if len(audio_data) < max_audio_length:
        # If too short, pad with zeros
        audio_data = np.pad(audio_data, (0, max_audio_length - len(audio_data)))
    else:
        audio_data = audio_data[:max_audio_length]
    mfccs = librosa.feature.mfcc(y=audio_data, sr=SAMPLE_RATE, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)

    return mfccs



In [2]:
import librosa.util

# Constants
SAMPLE_RATE = 5000
TRACK_DURATION = 30
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION

num_mfcc = 13
n_fft = 1024
hop_length = 256
num_segments = 15

# Calculate the maximum audio length in samples
max_audio_length = SAMPLES_PER_TRACK * num_segments

In [3]:
import random
# Function to create combinations of image and audio data
def create_data_combinations(image_folder, audio_folder, max_audio_length):
    images,labels,voices=[],[],[]
    class_mapping = {
    'disco': 0,
    'metal': 1,
    'reggae': 2,
    'blues': 3,
    'rock': 4,
    'classical': 5,
    'jazz': 6,
    'hiphop': 7,
    'country': 8,
    'pop': 9
    }

    for class_folder in os.listdir(image_folder):
        class_path = os.path.join(image_folder, class_folder)
        i=0
        for image_name in os.listdir(class_path):
            image_path = os.path.join(class_path, image_name)
            audio_path = os.path.join(audio_folder,class_folder, class_folder)
            # Load and preprocess image and audio data
            img_data = load_and_preprocess_image(image_path)
            audio_path = os.path.join(audio_folder, class_folder)
            audio_files = os.listdir(audio_path)
            selected_audio_files = random.sample(audio_files, min(5, len(audio_files)))
            for audio in selected_audio_files:
                try:
                    data_path=os.path.join(audio_path,audio)
                    audio_data = load_and_preprocess_audio(data_path, max_audio_length)
                    images.append(img_data)
                    voices.append(audio_data)
                    label = class_mapping[class_folder]
                    labels.append(label)
                except:
                    continue
            if i==60:
                break
            i=i+1
        print(class_folder)

    return images,voices,labels



In [4]:
import os
os.chdir("/kaggle/input/music-recomend-data/music_recomendation_Data/music_recomendation_Data")
    # Example: Create data combinations from image and audio folders
image_folder = 'images_original'
audio_folder = 'genres_original'
images,voices,labels = create_data_combinations(image_folder, audio_folder,max_audio_length)

disco
metal
reggae
blues
rock
classical


  audio_data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio_data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


jazz
hiphop
country
pop


In [5]:
# Split data into features and labels
X_image = np.array(images)
X_audio = np.array(voices)
Y_labels = np.array(labels)

In [6]:
X_audio.shape,X_image.shape,Y_labels.shape

((3048, 13, 8790), (3048, 224, 224, 3), (3048,))

In [7]:
print(np.unique(Y_labels))

[0 1 2 3 4 5 6 7 8 9]


In [8]:
from tensorflow.keras.utils import to_categorical
Y_labels = to_categorical(Y_labels, num_classes=10)

In [9]:
X_image_train, X_image_val, X_audio_train, X_audio_val, y_train, y_val = train_test_split(
    X_image, X_audio, Y_labels, test_size=0.2, random_state=42
)

In [10]:
X_image_train.shape,X_audio_train.shape,y_train.shape

((2438, 224, 224, 3), (2438, 13, 8790), (2438, 10))

In [11]:
X_image_val.shape,X_audio_val.shape,y_val.shape

((610, 224, 224, 3), (610, 13, 8790), (610, 10))

In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, LSTM, concatenate

from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input, decode_predictions
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.densenet import DenseNet121

# Load pre-trained VGG16 model with weights trained on ImageNet
vgg_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers of VGG16
for layer in vgg_model.layers:
    layer.trainable = False

# Define the shape of the input data
audio_input_shape = (num_mfcc, X_audio.shape[2])  # (number of MFCC coefficients, audio length)
image_input_shape = X_image.shape[1:]  # (image height, image width, number of channels)

# Define the input layers
audio_input = Input(shape=audio_input_shape, name='audio_input')
image_input = Input(shape=image_input_shape, name='image_input')

# Audio processing
audio_lstm = LSTM(64)(audio_input)
audio_output = Dense(32, activation='relu')(audio_lstm)

# Image processing using VGG16
image_vgg = vgg_model(image_input)
image_flatten = Flatten()(image_vgg)
image_output = Dense(512, activation='relu')(image_flatten)

# Concatenate the outputs of audio and image processing
merged = concatenate([audio_output, image_output])
merged = Dense(512, activation='relu')(merged)
# Final output layer
output = Dense(10, activation='softmax')(merged)  # Assuming a binary classification task

# Create the model
model = Model(inputs=[image_input,audio_input, ], outputs=output)

# Compile the model (specify the appropriate loss and optimizer based on your task)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 image_input (InputLayer)    [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 audio_input (InputLayer)    [(None, 13, 8790)]           0         []                            
                                                                                                  
 inception_v3 (Functional)   (None, 5, 5, 2048)           2180278   ['image_input[0][0]']         
                                                          4                                       
                                        

In [13]:
# Train the model
model.fit(
    [X_image_train, X_audio_train],
    y_train,
    epochs=40,
    validation_data=([X_image_val, X_audio_val], y_val),
    batch_size=2
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x79db5c0ed3f0>

In [14]:
# Evaluate the model
test_loss, test_acc = model.evaluate([X_image_val, X_audio_val], y_val)
print(f"Test accuracy: {test_acc}")

Test accuracy: 0.8295081853866577


In [15]:
    class_mapping = {
    'disco': 0,
    'metal': 1,
    'reggae': 2,
    'blues': 3,
    'rock': 4,
    'classical': 5,
    'jazz': 6,
    'hiphop': 7,
    'country': 8,
    'pop': 9
    }

In [16]:
from tensorflow.keras.models import load_model
# Save the model
model.save('/kaggle/working/InceptionV3_model.h5')

  saving_api.save_model(


In [18]:
# Load the model
loaded_model = load_model('/kaggle/working/InceptionV3_model.h5')

# Specify the paths to the new audio and image files
new_audio_path = '/kaggle/input/music-recomend-data/music_recomendation_Data/music_recomendation_Data/genres_original/disco/disco.00026.wav'
new_image_path = '/kaggle/input/music-recomend-data/music_recomendation_Data/music_recomendation_Data/images_original/disco/disco00026.png'


# Load and preprocess the single image and audio data
new_img_data = load_and_preprocess_image(new_image_path)
new_audio_data = load_and_preprocess_audio(new_audio_path, max_audio_length)

# Reshape the data to match the model input shape
new_img_data = np.expand_dims(new_img_data, axis=0)
new_audio_data = np.expand_dims(new_audio_data, axis=0)

# Make a prediction using the loaded model
prediction = loaded_model.predict([new_img_data, new_audio_data])

# Get the predicted label
predicted_label = np.argmax(prediction)

print(f'The predicted label is: {predicted_label}')


The predicted label is: 0


In [19]:
from sklearn.metrics import confusion_matrix, classification_report

# Assuming you have trained your model and obtained predictions on the validation set
predictions = model.predict([X_image_val, X_audio_val])

predicted_labels = np.argmax(predictions, axis=1)
actual_labels = np.argmax(y_val, axis=1)
# Create a list of class labels
class_labels = list(class_mapping.keys())
print()
# Generate confusion matrix with class names
conf_matrix = confusion_matrix(actual_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report with class names
class_report = classification_report(actual_labels, predicted_labels, target_names=class_labels)
print("Classification Report:")
print(class_report)



Confusion Matrix:
[[49  3  1  2  1  0  4  1  2  4]
 [ 4 58  1  2  0  0  0  1  4  4]
 [ 2  0 50  0  0  0  3  2  1  2]
 [ 0  5  3 45  2  0  2  1  1  1]
 [ 1  2  2  2 47  0  0  2  1  1]
 [ 0  0  1  0  0 56  0  0  0  0]
 [ 1  2  0  2  0  0 49  0  0  2]
 [ 0  1  4  1  1  0  1 45  1  0]
 [ 0  1  0  2  3  0  1  0 57  2]
 [ 1  0  1  0  1  0  2  1  2 50]]
Classification Report:
              precision    recall  f1-score   support

       disco       0.84      0.73      0.78        67
       metal       0.81      0.78      0.79        74
      reggae       0.79      0.83      0.81        60
       blues       0.80      0.75      0.78        60
        rock       0.85      0.81      0.83        58
   classical       1.00      0.98      0.99        57
        jazz       0.79      0.88      0.83        56
      hiphop       0.85      0.83      0.84        54
     country       0.83      0.86      0.84        66
         pop       0.76      0.86      0.81        58

    accuracy                   