# EDA and Pre-Processing

## Initial EDA

In [3]:
import pandas as pd

In [None]:
df = pd.read_csv('cleaned_lyrics_gender.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_lyrics_gender.csv'

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21057 entries, 0 to 21056
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   artist   21057 non-null  object
 1   seq      21057 non-null  object
 2   GENDER   21057 non-null  object
 3   IS_BAND  21054 non-null  object
dtypes: object(4)
memory usage: 658.2+ KB


In [None]:
df = df[df['GENDER'].isin(['female', 'male'])]

In [None]:
len(df)

19721

In [None]:
gender_proportions = df['GENDER'].value_counts(normalize=True)
gender_proportions

male      0.777952
female    0.222048
Name: GENDER, dtype: float64

In [None]:
band_proportions = df['IS_BAND'].value_counts(normalize=True)
band_proportions

False    0.600771
True     0.399229
Name: IS_BAND, dtype: float64

Now look at lyrics

In [None]:
df['seq'].iloc[0]

"Oh, Danny boy, the pipes, the pipes are calling\r\nFrom glen to glen, and down the mountain side.\r\nThe summer's gone, and all the roses falling,\r\nIt's you, it's you must go and I must bide.\r\n\r\nBut come ye back when summer's in the meadow,\r\nOr when the valley's hushed and white with snow,\r\nIt's I'll be here in sunshine or in shadow,\r\nOh, Danny boy, oh Danny boy, I love you so!\r\n\r\nBut when ye come, and all the flowers are dying,\r\nIf I am dead, as dead I well may be,\r\nYou'll come and find the place where I am lying,\r\nAnd kneel and say an Ave there for me.\r\nAnd I shall hear, though soft you tread above me,\r\nAnd all my grave will warmer, sweeter be,\r\nFor you will bend and tell me that you love me,\r\nAnd I shall sleep in peace until you come to me!"

## Data Pre-Procesing

### Loading Libraries

In [None]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Cleaning Lyrics

Here, we use the `re` library to clean the lyrics by keeping only desired charachters, turnig to lowercase and removing spaces.

In [None]:
def clean_lyrics(text):
    text = re.sub(r'[^a-zA-Z0-9\s!?]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['clean_lyrics'] = df['seq'].apply(clean_lyrics)

In [None]:
df.head()

Unnamed: 0,artist,seq,GENDER,IS_BAND,clean_lyrics
0,Elizabeth Naccarato,"Oh, Danny boy, the pipes, the pipes are callin...",female,False,oh danny boy the pipes the pipes are calling f...
1,Ella Fitzgerald,I never feel a thing is real\r\nWhen I'm away ...,female,False,i never feel a thing is real when im away from...
2,Ella Fitzgerald,"I really can't stay\r\nBut, baby, it's cold ou...",female,False,i really cant stay but baby its cold outside i...
3,Ella Fitzgerald,All my life\r\nI've been waiting for you\r\nMy...,female,False,all my life ive been waiting for you my wonder...
4,Ella Fitzgerald,I'll be down to get you in a taxi honey\r\nBet...,female,False,ill be down to get you in a taxi honey better ...


### Tokenize Lyrics

Tokenize from `NLTK` will be used to tokenoze the lyrics.
For reference if needed: https://www.nltk.org/api/nltk.tokenize.word_tokenize.html#nltk-tokenize-word-tokenize

In [None]:
def tokenize_lyrics(text):
    return word_tokenize(text)

df['tokenized_lyrics'] = df['clean_lyrics'].apply(tokenize_lyrics)

In [None]:
df.head()

Unnamed: 0,artist,seq,GENDER,IS_BAND,clean_lyrics,tokenized_lyrics
0,Elizabeth Naccarato,"Oh, Danny boy, the pipes, the pipes are callin...",female,False,oh danny boy the pipes the pipes are calling f...,"[oh, danny, boy, the, pipes, the, pipes, are, ..."
1,Ella Fitzgerald,I never feel a thing is real\r\nWhen I'm away ...,female,False,i never feel a thing is real when im away from...,"[i, never, feel, a, thing, is, real, when, im,..."
2,Ella Fitzgerald,"I really can't stay\r\nBut, baby, it's cold ou...",female,False,i really cant stay but baby its cold outside i...,"[i, really, cant, stay, but, baby, its, cold, ..."
3,Ella Fitzgerald,All my life\r\nI've been waiting for you\r\nMy...,female,False,all my life ive been waiting for you my wonder...,"[all, my, life, ive, been, waiting, for, you, ..."
4,Ella Fitzgerald,I'll be down to get you in a taxi honey\r\nBet...,female,False,ill be down to get you in a taxi honey better ...,"[ill, be, down, to, get, you, in, a, taxi, hon..."


### Stopword Removal

We loop through every token to see if it present in the stopwords, and remove it if it is.

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df['filtered_lyrics'] = df['tokenized_lyrics'].apply(remove_stopwords)

In [None]:
df.head()

Unnamed: 0,artist,seq,GENDER,IS_BAND,clean_lyrics,tokenized_lyrics,filtered_lyrics
0,Elizabeth Naccarato,"Oh, Danny boy, the pipes, the pipes are callin...",female,False,oh danny boy the pipes the pipes are calling f...,"[oh, danny, boy, the, pipes, the, pipes, are, ...","[oh, danny, boy, pipes, pipes, calling, glen, ..."
1,Ella Fitzgerald,I never feel a thing is real\r\nWhen I'm away ...,female,False,i never feel a thing is real when im away from...,"[i, never, feel, a, thing, is, real, when, im,...","[never, feel, thing, real, im, away, embrace, ..."
2,Ella Fitzgerald,"I really can't stay\r\nBut, baby, it's cold ou...",female,False,i really cant stay but baby its cold outside i...,"[i, really, cant, stay, but, baby, its, cold, ...","[really, cant, stay, baby, cold, outside, got,..."
3,Ella Fitzgerald,All my life\r\nI've been waiting for you\r\nMy...,female,False,all my life ive been waiting for you my wonder...,"[all, my, life, ive, been, waiting, for, you, ...","[life, ive, waiting, wonderful, one, ive, begu..."
4,Ella Fitzgerald,I'll be down to get you in a taxi honey\r\nBet...,female,False,ill be down to get you in a taxi honey better ...,"[ill, be, down, to, get, you, in, a, taxi, hon...","[ill, get, taxi, honey, better, ready, bout, h..."


### Lemmatization

Now we lemmatize the words
Source: https://www.nltk.org/api/nltk.stem.WordNetLemmatizer.html?highlight=wordnet

In [None]:
wnl = WordNetLemmatizer()

In [None]:
def lemmatize_lyrics(tokens):
    return [wnl.lemmatize(word) for word in tokens]

df['lemmatized_lyrics'] = df['filtered_lyrics'].apply(lemmatize_lyrics)

In [None]:
df.head()

Unnamed: 0,artist,seq,GENDER,IS_BAND,clean_lyrics,tokenized_lyrics,filtered_lyrics,lemmatized_lyrics
0,Elizabeth Naccarato,"Oh, Danny boy, the pipes, the pipes are callin...",female,False,oh danny boy the pipes the pipes are calling f...,"[oh, danny, boy, the, pipes, the, pipes, are, ...","[oh, danny, boy, pipes, pipes, calling, glen, ...","[oh, danny, boy, pipe, pipe, calling, glen, gl..."
1,Ella Fitzgerald,I never feel a thing is real\r\nWhen I'm away ...,female,False,i never feel a thing is real when im away from...,"[i, never, feel, a, thing, is, real, when, im,...","[never, feel, thing, real, im, away, embrace, ...","[never, feel, thing, real, im, away, embrace, ..."
2,Ella Fitzgerald,"I really can't stay\r\nBut, baby, it's cold ou...",female,False,i really cant stay but baby its cold outside i...,"[i, really, cant, stay, but, baby, its, cold, ...","[really, cant, stay, baby, cold, outside, got,...","[really, cant, stay, baby, cold, outside, got,..."
3,Ella Fitzgerald,All my life\r\nI've been waiting for you\r\nMy...,female,False,all my life ive been waiting for you my wonder...,"[all, my, life, ive, been, waiting, for, you, ...","[life, ive, waiting, wonderful, one, ive, begu...","[life, ive, waiting, wonderful, one, ive, begu..."
4,Ella Fitzgerald,I'll be down to get you in a taxi honey\r\nBet...,female,False,ill be down to get you in a taxi honey better ...,"[ill, be, down, to, get, you, in, a, taxi, hon...","[ill, get, taxi, honey, better, ready, bout, h...","[ill, get, taxi, honey, better, ready, bout, h..."


### Updated Data Frame

In [None]:
df2 = df[["lemmatized_lyrics", "GENDER"]]
df2.head()

Unnamed: 0,lemmatized_lyrics,GENDER
0,"[oh, danny, boy, pipe, pipe, calling, glen, gl...",female
1,"[never, feel, thing, real, im, away, embrace, ...",female
2,"[really, cant, stay, baby, cold, outside, got,...",female
3,"[life, ive, waiting, wonderful, one, ive, begu...",female
4,"[ill, get, taxi, honey, better, ready, bout, h...",female


In [None]:
df2['lemmatized_lyrics'].iloc[0]

['oh',
 'danny',
 'boy',
 'pipe',
 'pipe',
 'calling',
 'glen',
 'glen',
 'mountain',
 'side',
 'summer',
 'gone',
 'rose',
 'falling',
 'must',
 'go',
 'must',
 'bide',
 'come',
 'ye',
 'back',
 'summer',
 'meadow',
 'valley',
 'hushed',
 'white',
 'snow',
 'ill',
 'sunshine',
 'shadow',
 'oh',
 'danny',
 'boy',
 'oh',
 'danny',
 'boy',
 'love',
 '!',
 'ye',
 'come',
 'flower',
 'dying',
 'dead',
 'dead',
 'well',
 'may',
 'youll',
 'come',
 'find',
 'place',
 'lying',
 'kneel',
 'say',
 'ave',
 'shall',
 'hear',
 'though',
 'soft',
 'tread',
 'grave',
 'warmer',
 'sweeter',
 'bend',
 'tell',
 'love',
 'shall',
 'sleep',
 'peace',
 'come',
 '!']

### Set Threshold for Minimum Token Count

There may be songs that have very few lyrics, so we can't extract as much information. For now, an arbitrary minimum threshold of `25` tokens is set.

In [None]:
df2['lemmatized_lyrics_length'] = df2['lemmatized_lyrics'].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['lemmatized_lyrics_length'] = df2['lemmatized_lyrics'].apply(len)


In [None]:
threshold = 25

In [None]:
df_length_count = df2[df2['lemmatized_lyrics_length'] >= threshold]
df3 = df_length_count[["lemmatized_lyrics", "GENDER"]]

In [None]:
len(df3) #Decreased from 19721

19487

In [None]:
df3

Unnamed: 0,lemmatized_lyrics,GENDER
0,"[oh, danny, boy, pipe, pipe, calling, glen, gl...",female
1,"[never, feel, thing, real, im, away, embrace, ...",female
2,"[really, cant, stay, baby, cold, outside, got,...",female
3,"[life, ive, waiting, wonderful, one, ive, begu...",female
4,"[ill, get, taxi, honey, better, ready, bout, h...",female
...,...,...
21052,"[tomboy, hail, mary, never, need, dress, make,...",female
21053,"[throw, line, cant, reel, throw, dart, cant, m...",female
21054,"[mind, cluttered, kitchen, sink, heart, empty,...",female
21055,"[well, moment, heavy, im, ready, like, caged, ...",female


In [None]:
gender_proportions = df3['GENDER'].value_counts(normalize=True) # We have similar split
gender_proportions

male      0.777236
female    0.222764
Name: GENDER, dtype: float64

In [None]:
df3.to_csv("data/cleaned_eda.csv", index = False)

In [3]:
from google.colab import files
import pandas as pd

uploaded = files.upload()

df4 = pd.read_csv('cleaned_eda.csv')
df4.head()


Saving cleaned_eda.csv to cleaned_eda.csv


Unnamed: 0,lemmatized_lyrics,GENDER
0,"['oh', 'danny', 'boy', 'pipe', 'pipe', 'callin...",female
1,"['never', 'feel', 'thing', 'real', 'im', 'away...",female
2,"['really', 'cant', 'stay', 'baby', 'cold', 'ou...",female
3,"['life', 'ive', 'waiting', 'wonderful', 'one',...",female
4,"['ill', 'get', 'taxi', 'honey', 'better', 'rea...",female


In [43]:

df4 = pd.read_csv('cleaned_eda.csv')
df4.head()

Unnamed: 0,lemmatized_lyrics,GENDER
0,"['oh', 'danny', 'boy', 'pipe', 'pipe', 'callin...",female
1,"['never', 'feel', 'thing', 'real', 'im', 'away...",female
2,"['really', 'cant', 'stay', 'baby', 'cold', 'ou...",female
3,"['life', 'ive', 'waiting', 'wonderful', 'one',...",female
4,"['ill', 'get', 'taxi', 'honey', 'better', 'rea...",female


In [44]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Create a simple clean-up function for the lyrics
def clean_lyrics(text):
    text = re.sub(r'[^a-zA-Z0-9\s!?]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Tokenize the lyrics using nltk's word_tokenize
def tokenize_lyrics(text):
    return word_tokenize(text)

# Lemmatize the tokens using WordNetLemmatizer
wnl = WordNetLemmatizer()
def lemmatize_lyrics(tokens):
    return [wnl.lemmatize(word) for word in tokens]

df = df4

df['clean_lyrics'] = df['lemmatized_lyrics'].apply(clean_lyrics)
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')
df['tokenized_lyrics'] = df['clean_lyrics'].apply(tokenize_lyrics)
df['lemmatized_lyrics'] = df['tokenized_lyrics'].apply(lemmatize_lyrics)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
# Create vocabulary and encode sequences
all_words = [word for sentence in df['lemmatized_lyrics'] for word in sentence]
vocab = {word: idx + 1 for idx, word in enumerate(set(all_words))}  # Map words to indices

def text_to_sequence(text, vocab):
    return [vocab.get(word, 0) for word in text]  # Use 0 for out-of-vocab words

df['lyrics_seq'] = df['lemmatized_lyrics'].apply(lambda x: text_to_sequence(x, vocab))


In [46]:
#Padding sequences to the same length
max_len = 100
def pad_sequence(sequences, max_len):
    return [seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in sequences]

df['padded_lyrics'] = pad_sequence(df['lyrics_seq'], max_len)


In [47]:
from sklearn.preprocessing import LabelEncoder

# Convert labels (gender) to numeric values using LabelEncoder
label_encoder = LabelEncoder()
df['gender_label'] = label_encoder.fit_transform(df['GENDER'])


In [48]:
import numpy as np # Import numpy and assign it to the alias 'np'

from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X = np.array(df['padded_lyrics'].tolist())
y = np.array(df['gender_label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
import torch

# Convert to PyTorch tensors
X_train = torch.tensor(X_train).float()
X_test = torch.tensor(X_test).float()
y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)


In [50]:
from torch.utils.data import DataLoader, TensorDataset

# Create DataLoader for batching
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [51]:
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, output_size, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x.long())
        rnn_out, _ = self.rnn(x)
        last = rnn_out[:, -1, :]
        return self.fc(last)


In [52]:
vocab_size = len(vocab)
embed_dim  = 100
hidden_size= 128
num_classes= 2

model = RNNModel(vocab_size, embed_dim, hidden_size, num_classes)


In [53]:
X_train = torch.tensor(X_train, dtype=torch.long)
X_test  = torch.tensor(X_test,  dtype=torch.long)


  X_train = torch.tensor(X_train, dtype=torch.long)
  X_test  = torch.tensor(X_test,  dtype=torch.long)


In [54]:
import torch

counts = np.bincount(y_train.numpy())
weights = 1. / counts
class_weights = torch.tensor(weights, dtype=torch.float)

criterion = nn.CrossEntropyLoss(weight=class_weights)


In [55]:

optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)


# learning rate
lr = 0.001

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for inputs, labels in train_loader:
        model.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()

        with torch.no_grad():
            for param in model.parameters():
                param.data -= lr * param.grad

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")


Epoch [1/5], Loss: 0.6951
Epoch [2/5], Loss: 0.6939
Epoch [3/5], Loss: 0.6931
Epoch [4/5], Loss: 0.6922
Epoch [5/5], Loss: 0.6916


In [56]:
from sklearn.metrics import accuracy_score, classification_report

model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        y_pred.extend(predicted.numpy())
        y_true.extend(labels.numpy())

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))


Accuracy: 64.55%
Classification Report:
              precision    recall  f1-score   support

      female       0.24      0.27      0.25       867
        male       0.78      0.75      0.77      3031

    accuracy                           0.65      3898
   macro avg       0.51      0.51      0.51      3898
weighted avg       0.66      0.65      0.65      3898

