# Lyrics Generator

In [39]:
import os
import pandas as pd

In [40]:

# Define input and output paths
folder_path = r"E:\Lyrics_Generator\Data_Sets\Songs"
output_folder = r"E:\Lyrics_Generator\Data_Sets"
output_file = os.path.join(output_folder, "merged_songs.csv")

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Get all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith(".csv")]

# Read and merge
dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
merged_df = pd.concat(dfs, ignore_index=True)

# Save to CSV
merged_df.to_csv(output_file, index=False)
print(f"Merged {len(csv_files)} CSV files into '{output_file}' successfully!")

# folder_path = "E:\AI\Lyrics_Generator_RNN\Song_dataset"

# csv_files = [file for file in os.listdir(folder_path) if file.endswith(".csv")]

# dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
# merged_df = pd.concat(dfs, ignore_index=True)

# merged_df.to_csv(r"E:\AI\Lyrics_Generator_RNN\Data_sets\merged_songs.csv", index=False)

# print(f"Merged {len(csv_files)} CSV files into 'merged_songs.csv' successfully!")


Merged 20 CSV files into 'E:\Lyrics_Generator\Data_Sets\merged_songs.csv' successfully!


## Loading the DataSets

In [41]:
df = pd.read_csv(r"E:\Lyrics_Generator\Data_sets\merged_songs.csv")

In [42]:
df.head(3)

Unnamed: 0.1,Artist,Title,Album,Date,Lyric,Year,Unnamed: 0
0,Ariana Grande,"​thank u, next","thank u, next",2018-11-03,thought i'd end up with sean but he wasn't a m...,2018.0,
1,Ariana Grande,7 rings,"thank u, next",2019-01-18,yeah breakfast at tiffany's and bottles of bub...,2019.0,
2,Ariana Grande,​God is a woman,Sweetener,2018-07-13,you you love it how i move you you love it how...,2018.0,


## Exploring the data

In [43]:
df['Artist'].value_counts()

Artist
Eminem           521
Taylor Swift     479
Drake            466
Beyoncé          406
Rihanna          405
Lady Gaga        402
Justin Bieber    348
Coldplay         344
Katy Perry       325
Nicki Minaj      323
Ariana Grande    308
Ed Sheeran       296
Dua Lipa         247
Maroon 5         197
Selena Gomez     175
Post Malone      148
Billie Eilish    145
Charlie Puth      75
Cardi B           75
Khalid            64
Name: count, dtype: int64

In [44]:
df.shape

(5749, 7)

## Checking for the missing data

In [45]:
df.isnull().sum()

Artist           0
Title            0
Album         1552
Date          1744
Lyric           38
Year          1744
Unnamed: 0     308
dtype: int64

In [46]:
df = df.drop(columns=['Album', 'Date', 'Year', 'Unnamed: 0'])

In [47]:
df.isnull().sum()

Artist     0
Title      0
Lyric     38
dtype: int64

In [48]:
df = df.dropna(subset=['Lyric'])  # Remove rows with missing lyrics
df = df.drop_duplicates()  # Remove duplicate lyrics

In [49]:
df.isnull().sum()

Artist    0
Title     0
Lyric     0
dtype: int64

In [50]:
df.shape

(5711, 3)

In [51]:
df['Artist'].value_counts()

Artist
Eminem           521
Taylor Swift     477
Drake            464
Beyoncé          406
Rihanna          397
Lady Gaga        395
Justin Bieber    347
Coldplay         333
Katy Perry       324
Nicki Minaj      321
Ariana Grande    308
Ed Sheeran       294
Dua Lipa         246
Maroon 5         197
Selena Gomez     174
Post Malone      148
Billie Eilish    145
Charlie Puth      75
Cardi B           75
Khalid            64
Name: count, dtype: int64

## Text Preprocessing

In [52]:
new_df = df.copy()

In [53]:
new_df

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
2,Ariana Grande,​God is a woman,you you love it how i move you you love it how...
3,Ariana Grande,Side To Side,ariana grande nicki minaj i've been here all ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i wanna be in...
...,...,...,...
5744,Taylor Swift,Teardrops on my Guitar (Live from Clear Channe...,drew looks at me i fake a smile so he won't se...
5745,Taylor Swift,Evermore [Forward],to put it plainly we just couldnt stop writing...
5746,Taylor Swift,Welcome Back Grunwald,turn wycd on you're on your grunwald back from...
5747,Taylor Swift,Tolerate it (Polskie Tłumaczenie),zwrotka siedzę i patrzę jak czytasz z głową p...


### Lowercasing

In [54]:
new_df['Title'] = new_df['Title'].str.lower()
new_df['Lyric'] = new_df['Lyric'].str.lower()

In [55]:
new_df.head(3)

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...


#### Cleaning functions

In [56]:
import re

In [57]:
# Pre-checking the lyrics
def has_html_tags(text):
    """Check if text contains HTML tags."""
    if not isinstance(text, str):
        return False
    return bool(re.search(r'<[^>]+>', text))

def has_urls(text):
    """Check if text contains URLs."""
    if not isinstance(text, str):
        return False
    url_pattern = r'https?://\S+|www\.\S+|\S+\.\S+/\S+'
    return bool(re.search(url_pattern, text))

# def has_punctuation(text):
#     """Check if text contains punctuation."""
#     if not isinstance(text, str):
#         return False
#     return any(c in string.punctuation for c in text)

In [58]:
# Function to remove html tags and urls
def remove_html_tags(text):
    """Remove HTML tags from text."""
    if not isinstance(text, str):
        return text
    pattern = re.compile(r'<.*?>')
    return pattern.sub('', text).strip()

def remove_urls(text):
    """Remove URLs from text."""
    if not isinstance(text, str):
        return text
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text).strip()

# def remove_punctuation(text):
#     """Remove punctuation from text."""
#     if not isinstance(text, str):
#         return text
#     return text.translate(str.maketrans('', '', string.punctuation))

In [59]:
lyric_html_count = new_df['Lyric'].apply(has_html_tags).sum()
lyric_url_count = new_df['Lyric'].apply(has_urls).sum()
# lyric_punctuation_count = new_df['Lyric'].apply(has_punctuation).sum()

# title_html_count = new_df['Title'].apply(has_html_tags).sum()
# title_url_count = new_df['Title'].apply(has_urls).sum()
# title_punctuation_count = new_df['Title'].apply(has_punctuation).sum()

total_rows = len(new_df)
print("Precheck Summary:")
print(f"Lyrics with HTML tags: {lyric_html_count} ({(lyric_html_count/total_rows)*100:.2f}%)")
print(f"Lyrics with URLs: {lyric_url_count} ({(lyric_url_count/total_rows)*100:.2f}%)")
# print(f"Lyrics with punctuation: {lyric_punctuation_count} ({(lyric_punctuation_count/total_rows)*100:.2f}%)")

# print(f"Titles with HTML tags: {title_html_count} ({(title_html_count/total_rows)*100:.2f}%)")
# print(f"Titles with URLs: {title_url_count} ({(title_url_count/total_rows)*100:.2f}%)")
# print(f"Titles with punctuation: {title_punctuation_count} ({(title_punctuation_count/total_rows)*100:.2f}%)")

Precheck Summary:
Lyrics with HTML tags: 0 (0.00%)
Lyrics with URLs: 0 (0.00%)


In [60]:
# Cleaning based on pre-check results

if lyric_html_count > 0:
    new_df['Lyric'] = new_df['Lyric'].apply(remove_html_tags)

if lyric_url_count > 0:
    new_df['Lyric'] = new_df['Lyric'].apply(remove_urls)
    
# if lyric_punctuation_count > 0:
#     new_df['Lyric'] = new_df['Lyric'].apply(remove_punctuation)

In [61]:
# Recalculate after cleaning
lyric_html_count_after = new_df['Lyric'].apply(has_html_tags).sum()
lyric_url_count_after = new_df['Lyric'].apply(has_urls).sum()
# lyric_punctuation_count_after = new_df['Lyric'].apply(has_punctuation).sum()

print("\nPost-cleaning Result:")
print(f"Lyrics with HTML tags: {lyric_html_count_after} ({(lyric_html_count_after/total_rows)*100:.2f}%)")
print(f"Lyrics with URLs: {lyric_url_count_after} ({(lyric_url_count_after/total_rows)*100:.2f}%)")
# print(f"Lyrics with punctuation: {lyric_punctuation_count_after} ({(lyric_punctuation_count_after/total_rows)*100:.2f}%)")



Post-cleaning Result:
Lyrics with HTML tags: 0 (0.00%)
Lyrics with URLs: 0 (0.00%)


In [62]:
new_df['Lyric'].head()

0    thought i'd end up with sean but he wasn't a m...
1    yeah breakfast at tiffany's and bottles of bub...
2    you you love it how i move you you love it how...
3    ariana grande  nicki minaj i've been here all ...
4    right now i'm in a state of mind i wanna be in...
Name: Lyric, dtype: object

## Chat Words Treatment

In [63]:
chat_words ={
    "u": "you",
    "ur": "your",
    "r": "are",
    "ya": "you",
    "luv": "love",
    "b4": "before",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "lemme": "let me",
    "gimme": "give me",
    "im": "i am",
    "dont": "don't",
    "idk": "i don't know",
    "brb": "be right back",
    "lol": "laughing out loud",
    "omg": "oh my god",
    "ttyl": "talk to you later",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "bff": "best friends forever",
    "ily": "i love you",
    "jk": "just kidding",
    "bday": "birthday",
    "cuz": "because",
    "coz": "because",
    "thx": "thanks",
    "pls": "please",
    "plz": "please",
    "bae": "before anyone else",
    "dm": "direct message",
    "noob": "beginner",
    "sk8": "skate",
    "gr8": "great",
    "np": "no problem",
    "l8r": "later",
    "idc": "i don't care",
    "zzz": "sleeping",
    "omw": "on my way"
}

# Checking the number of lyrics containing the chat words

def contains_chat_words(text):
    if not isinstance(text, str):
        return False
    words = text.split()
    return any(word in chat_words for word in words)

# Check how many lyrics contain chat words
chat_word_count = new_df['Lyric'].apply(contains_chat_words).sum()
print(f"Number of lyrics containing chat words: {chat_word_count}")

# Removing the chat words from the lyrics

def chat_word_treatment(text):
    if not isinstance(text, str):
        return text
    words = text.split()
    normalized_words = [chat_words.get(w.lower(), w) for w in words]
    return ' '.join(normalized_words)

# Apply to the lyric column
new_df['Lyric'] = new_df['Lyric'].apply(chat_word_treatment)

chat_word_count_after = new_df['Lyric'].apply(contains_chat_words).sum()
print(f"Number of lyrics containing chat words: {chat_word_count_after}")

Number of lyrics containing chat words: 3374
Number of lyrics containing chat words: 0


## Spelling Correction

In [64]:
pip install pyspellchecker




In [65]:
# from spellchecker import SpellChecker

# spell = SpellChecker()

# def count_misspelled_words(text):
#     if not isinstance(text, str):
#         return 0
#     words = text.split()
#     misspelled = spell.unknown(words)
#     return len(misspelled)

# total_misspelled_words = new_df['Lyric'].apply(count_misspelled_words).sum()
# print(f"Total misspelled words: {total_misspelled_words}")

# # def correct_spelling(text):
# #     if not isinstance(text, str):
# #         return text
# #     corrected_words = []
# #     for word in text.split():
# #         corrected = spell.correction(word)
# #         corrected_words.append(corrected if corrected else word)
# #     return ' '.join(corrected_words)

from spellchecker import SpellChecker

spell = SpellChecker()

# Function to return misspelled words in a text
def get_misspelled_words(text):
    if not isinstance(text, str):
        return []
    words = text.split()
    return list(spell.unknown(words))

# create a new column with misspelled words
new_df['Misspelled_Words'] = new_df['Lyric'].apply(get_misspelled_words)

# Total misspelled words count
total_misspelled_words = new_df['Misspelled_Words'].apply(len).sum()
print(f"Total misspelled words: {total_misspelled_words}")

# View the misspelled words only (unique across dataset)
from itertools import chain

all_misspelled = list(chain.from_iterable(new_df['Misspelled_Words']))
unique_misspelled = set(all_misspelled)

print(f"Unique misspelled words ({len(unique_misspelled)}):")
print(unique_misspelled)

# View the first 10 rows with their misspelled words
print(new_df[['Misspelled_Words']].head(10))


Total misspelled words: 59049
Unique misspelled words (19201):
{"'cise", "cornylookin'", 'roppo', 'antonoff', 'shoobydedoowop', 'gyllenhaal', 'bebek', 'kis', 'anahtar', "''poo", 'meryl', 'umdadahdadahda', 'dusze', 'nalick', 'jarrell', 'eart', 'gstrings', 'tellin', 'αίωνiαις', "trucetrucedon't", 'navoho', 'popatrz', 'frizzench', 'goride', 'estow', 'luthor', 'anydamnwhere', 'loohohove', 'götüme', 'sonman', 'ایهام', 'ocurr', 'ediyorsunuz', 'apač', 'gue', 'irs', 'worldworldworld', 'ohohohoh', 'breakbeats', 'flavour', 'coulent', 'favourite', "ya's", 'дать', 'pussyass', 'nfing', 'bumass', 'piernas', "boo'd", 'conin', 'aacting', 'منظور', 'noohohoh', "m'ucht", "gallenson's", 'wheewheewheewheewheewheewheelers', 'hackup', 'hari', 'ئەگەر', 'oldo', 'spreee', 'yodelodelayheehoo', 'woop', 'wiggin', 'بی', 'smallass', "answerin'", "flamin'", 'wahlberg', 'bbt', 'beyonceing', 'fernandx', 'abuhbuhbuh', "hoggin'", 'seducción', 'sasame', "lypo'd", 'laidback', "firin'", 'keester', 'meghalnának', 'shooin', '

In [66]:
new_df

Unnamed: 0,Artist,Title,Lyric,Misspelled_Words
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,"[pre, nothin', ari, 'bout, havin', gon', 'caus..."
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,"[pre, gleamin', poppin', tiffany's, atm, would..."
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...,"[pre, tellin', onetrack, uniwhen, feelin']"
3,Ariana Grande,side to side,ariana grande nicki minaj i've been here all n...,"[nicki, grande, ariana, minaj, tryna, pre, tal..."
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i want to be ...,"[pre, ohyeah, rainin', comin', 'em, turnin', p..."
...,...,...,...,...
5744,Taylor Swift,teardrops on my guitar (live from clear channe...,drew looks at me i fake a smile so he won't se...,"[pre, 'cause, whos]"
5745,Taylor Swift,evermore [forward],to put it plainly we just couldnt stop writing...,"[imaginarynot, couldnt, folklorian, dreamscape..."
5746,Taylor Swift,welcome back grunwald,turn wycd on you're on your grunwald back from...,"[wycd, grunwald, grunwald's]"
5747,Taylor Swift,tolerate it (polskie tłumaczenie),zwrotka siedzę i patrzę jak czytasz z głową po...,"[błagam, wyjęła, tyle, refren, inne, budzę, tw..."


In [69]:
# Filtering the non-English Words

from nltk.corpus import words

english_words = set(words.words())

def remove_non_english_words(text):
    if not isinstance(text, str):
        return text
    return ' '.join([word for word in text.split() if word.lower() in english_words])

new_df['Lyric'] = new_df['Lyric'].apply(remove_non_english_words)


In [70]:
misspelled_words_after_filter= new_df['Lyric'].apply(get_misspelled_words)

# Total misspelled words count
total_misspelled_words_after_filter = new_df['Misspelled_Words'].apply(len).sum()
print(f"Total misspelled words: {total_misspelled_words_after_filter}")

Total misspelled words: 59049


In [None]:
test = "tttt ten توڕەیم" 
tested = remove_non_english_words(test)
print(f"Tested string after removing non-English words: '{tested}'")

Tested string after removing non-English words: 'ten'


In [73]:
# function to correct spelling in lyrics
# def correct_spelling(text):
#     if not isinstance(text, str):
#         return text
#     corrected_words = []
#     for word in text.split():
#         corrected = spell.correction(word)
#         corrected_words.append(corrected if corrected else word)
#     return ' '.join(corrected_words)

# new_df['Lyric'] = new_df['Lyric'].apply(correct_spelling)

# misspelled_words_check = new_df['Lyric'].apply(get_misspelled_words).sum()
# print(f"Total misspelled words after correction: {len(misspelled_words_check)}")



# # Step 1: Get all words
# all_words = set(word for lyric in new_df['Lyric'] if isinstance(lyric, str) for word in lyric.split())

# # Step 2: Separate known and unknown
# misspelled_words = spell.unknown(all_words)

# # Step 3: Correct only the misspelled ones
# correction_dict = {word: spell.correction(word) for word in misspelled_words}

# # Step 4: Apply correction only where needed
# def correct_spelling_faster(text):
#     if not isinstance(text, str):
#         return text
#     return ' '.join([correction_dict.get(word, word) for word in text.split()])

# new_df['Lyric'] = new_df['Lyric'].apply(correct_spelling_faster)


def correct_spelling(text):
    words = text.split()
    corrected_words = []
    misspelled = spell.unknown(words)
    for word in words:
        if word in misspelled:
            correction = spell.correction(word)
            corrected_words.append(correction if correction else word)
        else:
            corrected_words.append(word)
    return ' '.join(corrected_words)


In [None]:
if total_misspelled_words > 0:
    new_df['Lyric'] = new_df['Lyric'].apply(correct_spelling)
    

new_df['Misspelled_Words'] = new_df['Lyric'].apply(get_misspelled_words)
total_misspelled_words_post = new_df['Misspelled_Words'].apply(len).sum()

## Feature Engineering

In [None]:
# # Creating a mapping dictionary from artist to genre
# genre_map = {
#     'Eminem': 'Hip Hop / Rap',
#     'Taylor Swift': 'Pop / Country / Synthpop',
#     'Drake': 'Hip Hop / Rap / R&B',
#     'Beyoncé': 'R&B / Pop / Hip Hop',
#     'Rihanna': 'Pop / R&B / Dancehall',
#     'Lady Gaga': 'Pop / Dance / Electronic',
#     'Justin Bieber': 'Pop / R&B',
#     'Coldplay': 'Alternative Rock / Pop Rock',
#     'Katy Perry': 'Pop',
#     'Nicki Minaj': 'Hip Hop / Rap / Pop',
#     'Ariana Grande': 'Pop / R&B',
#     'Ed Sheeran': 'Pop / Folk Pop',
#     'BTS (방탄소년단)': 'K-pop / Pop / Hip Hop',
#     'Dua Lipa': 'Pop / Dance / Disco',
#     'Maroon 5': 'Pop Rock / Funk Pop',
#     'Selena Gomez': 'Pop / Dance Pop',
#     'Post Malone': 'Hip Hop / Pop / Trap',
#     'Billie Eilish': 'Alternative / Pop / Electropop',
#     'Charlie Puth': 'Pop / R&B',
#     'Cardi B': 'Hip Hop / Rap',
#     'Khalid': 'R&B / Pop / Soul'
# }

In [None]:
# Adding the new 'Genre' column based on the mapping

# df['Genre'] = df['Artist'].map(genre_map)

In [None]:
# df.head()

Unnamed: 0,Artist,Title,Lyric,Genre
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,Pop / R&B
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,Pop / R&B
2,Ariana Grande,​God is a woman,you you love it how i move you you love it how...,Pop / R&B
3,Ariana Grande,Side To Side,ariana grande nicki minaj i've been here all ...,Pop / R&B
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i wanna be in...,Pop / R&B


In [None]:
# df['Genre'].isnull().sum()

0

In [None]:
# df['Genre'].value_counts()

Genre
Pop / R&B                         730
Hip Hop / Rap                     596
Pop / Country / Synthpop          477
Hip Hop / Rap / R&B               464
R&B / Pop / Hip Hop               406
Pop / R&B / Dancehall             397
Pop / Dance / Electronic          395
Alternative Rock / Pop Rock       333
Pop                               324
Hip Hop / Rap / Pop               321
Pop / Folk Pop                    294
K-pop / Pop / Hip Hop             270
Pop / Dance / Disco               246
Pop Rock / Funk Pop               197
Pop / Dance Pop                   174
Hip Hop / Pop / Trap              148
Alternative / Pop / Electropop    145
R&B / Pop / Soul                   64
Name: count, dtype: int64

In [None]:
# df['Genre'].unique()

array(['Pop / R&B', 'R&B / Pop / Hip Hop',
       'Alternative / Pop / Electropop', 'K-pop / Pop / Hip Hop',
       'Hip Hop / Rap', 'Alternative Rock / Pop Rock',
       'Hip Hop / Rap / R&B', 'Pop / Dance / Disco', 'Pop / Folk Pop',
       'Pop', 'R&B / Pop / Soul', 'Pop / Dance / Electronic',
       'Pop Rock / Funk Pop', 'Hip Hop / Rap / Pop',
       'Hip Hop / Pop / Trap', 'Pop / R&B / Dancehall', 'Pop / Dance Pop',
       'Pop / Country / Synthpop'], dtype=object)