In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MultiLabelBinarizer


# Predicting the Genres of Songs by their Lyrics
In this project, we aim to predict musical genres based on song lyrics and artist characteristics. .


### Load the Datasets
We have two datasets: one with lyrics (lyrics_dataset.csv) and another with artist information (artists_dataset.csv). We'll load both into pandas DataFrames.

In [38]:
artists_df = pd.read_csv('data/artists-data.csv')
lyrics_df = pd.read_csv('data/lyrics-data.csv',delimiter=',', quotechar='"')

### Data Preprocessing
Before we can analyze the data, we need to preprocess it. This includes standardizing the column with the link to the artist's account and merging the two datasets on this column.

In [39]:
# Remove leading/trailing spaces if any
artists_df['Link'] = artists_df['Link'].str.strip()
lyrics_df['ALink'] = lyrics_df['ALink'].str.strip()

# Merge the datasets on the link to the artist's account
merged_df = pd.merge(lyrics_df, artists_df, left_on='ALink', right_on='Link', how='left')



Dropping the rows with missing lyrics or genres

In [40]:
# handle missing values (drop rows with missing lyrics or genres)
merged_df.dropna(subset=['Lyric', 'Genres'], inplace=True)

# Replace \r and \n in 'Lyric' only if the entry is a string
merged_df['Lyric'] = merged_df['Lyric'].apply(lambda x: x.replace('\r', '').replace('\n', ' ') if isinstance(x, str) else x)



### Filter Out Non-English Songs

In [41]:
# Exclude songs with Spanish lyrics
merged_df = merged_df[merged_df['language'] == 'en']

# Display a random sample of 5 rows from the dataframe
print(merged_df.head())


               ALink                                              SName  \
69   /ivete-sangalo/                                   Careless Whisper   
86   /ivete-sangalo/  Could You Be Loved / Citação Musical do Rap: S...   
88   /ivete-sangalo/                             Cruisin' (Part. Saulo)   
111  /ivete-sangalo/                                               Easy   
140  /ivete-sangalo/                  For Your Babies (The Voice cover)   

                                                 SLink  \
69                /ivete-sangalo/careless-whisper.html   
86   /ivete-sangalo/could-you-be-loved-citacao-musi...   
88              /ivete-sangalo/cruisin-part-saulo.html   
111                           /ivete-sangalo/easy.html   
140  /ivete-sangalo/for-your-babies-the-voice-cover...   

                                                 Lyric language  \
69   I feel so unsure As I take your hand and lead ...       en   
86   Don't let them fool, ya Or even try to school,...       en   

In [42]:

from collections import Counter

# Splitting the genres and creating a flat list
all_genres = [genre for sublist in merged_df['Genres'].dropna() for genre in sublist.split(';')]

# Counting the occurrences of each genre
genre_counts = Counter(all_genres)

# Displaying the most common genres
for genre, count in genre_counts.most_common():
    print(f"{genre}: {count}")




 Rock: 38540
Rock: 25177
 Pop: 19812
 Pop/Rock: 17341
 Romântico: 14267
Pop: 13759
Heavy Metal: 13496
Indie: 12998
 Hard Rock: 12618
 Hip Hop: 12490
 Black Music: 11405
 R&B: 10272
Rap: 9589
Pop/Rock: 9019
Hip Hop: 8412
 Heavy Metal: 8263
 Dance: 7643
 Rap: 7525
 Rock Alternativo: 7506
Country: 7377
 Indie: 7136
 Folk: 7069
Rock Alternativo: 5555
R&B: 5309
 Punk Rock: 5058
Gospel/Religioso: 5017
 Electronica: 4835
Hard Rock: 4632
Soul Music: 4518
Dance: 4252
Punk Rock: 4157
 Blues: 4056
Folk: 4055
 Hardcore: 4009
Soft Rock: 3863
Romântico: 3680
 Soul Music: 3613
 Country: 3253
 Gótico: 3138
Trilha Sonora: 3112
 Trilha Sonora: 3101
Jazz: 3086
Hardcore: 3073
Electronica: 2964
 Tecnopop: 2879
Progressivo: 2497
Rockabilly: 2495
 Gospel/Religioso: 2320
 Funk: 2094
Blues: 2038
 Pós-Punk: 2037
Reggae: 1990
 Progressivo: 1868
 Pop/Punk: 1860
Gótico: 1846
 Clássico: 1814
 Grunge: 1752
 Psicodelia: 1717
 Instrumental: 1717
 Disco: 1630
Black Music: 1619
 Axé: 1606
 New Wave: 1585
New Wave: 1490
