# Lyrics Generator

In [2]:
import os
import pandas as pd

In [3]:

# Define input and output paths
folder_path = r"E:\Lyrics_Generator\Data_Sets\Songs"
output_folder = r"E:\Lyrics_Generator\Data_Sets"
output_file = os.path.join(output_folder, "merged_songs.csv")

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Get all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith(".csv")]

# Read and merge
dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
merged_df = pd.concat(dfs, ignore_index=True)

# Save to CSV
merged_df.to_csv(output_file, index=False)
print(f"Merged {len(csv_files)} CSV files into '{output_file}' successfully!")

# folder_path = "E:\AI\Lyrics_Generator_RNN\Song_dataset"

# csv_files = [file for file in os.listdir(folder_path) if file.endswith(".csv")]

# dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
# merged_df = pd.concat(dfs, ignore_index=True)

# merged_df.to_csv(r"E:\AI\Lyrics_Generator_RNN\Data_sets\merged_songs.csv", index=False)

# print(f"Merged {len(csv_files)} CSV files into 'merged_songs.csv' successfully!")


Merged 20 CSV files into 'E:\Lyrics_Generator\Data_Sets\merged_songs.csv' successfully!


## Loading the DataSets

In [4]:
df = pd.read_csv(r"E:\Lyrics_Generator\Data_sets\merged_songs.csv")

In [5]:
df.head(3)

Unnamed: 0.1,Artist,Title,Album,Date,Lyric,Year,Unnamed: 0
0,Ariana Grande,"​thank u, next","thank u, next",2018-11-03,thought i'd end up with sean but he wasn't a m...,2018.0,
1,Ariana Grande,7 rings,"thank u, next",2019-01-18,yeah breakfast at tiffany's and bottles of bub...,2019.0,
2,Ariana Grande,​God is a woman,Sweetener,2018-07-13,you you love it how i move you you love it how...,2018.0,


## Exploring the data

In [6]:
df['Artist'].value_counts()

Artist
Eminem           521
Taylor Swift     479
Drake            466
Beyoncé          406
Rihanna          405
Lady Gaga        402
Justin Bieber    348
Coldplay         344
Katy Perry       325
Nicki Minaj      323
Ariana Grande    308
Ed Sheeran       296
Dua Lipa         247
Maroon 5         197
Selena Gomez     175
Post Malone      148
Billie Eilish    145
Charlie Puth      75
Cardi B           75
Khalid            64
Name: count, dtype: int64

In [7]:
df.shape

(5749, 7)

## Checking for the missing data

In [8]:
df.isnull().sum()

Artist           0
Title            0
Album         1552
Date          1744
Lyric           38
Year          1744
Unnamed: 0     308
dtype: int64

In [9]:
df = df.drop(columns=['Album', 'Date', 'Year', 'Unnamed: 0'])

In [10]:
df.isnull().sum()

Artist     0
Title      0
Lyric     38
dtype: int64

In [11]:
df = df.dropna(subset=['Lyric'])  # Remove rows with missing lyrics
df = df.drop_duplicates()  # Remove duplicate lyrics

In [12]:
df.isnull().sum()

Artist    0
Title     0
Lyric     0
dtype: int64

In [13]:
df.shape

(5711, 3)

In [14]:
df['Artist'].value_counts()

Artist
Eminem           521
Taylor Swift     477
Drake            464
Beyoncé          406
Rihanna          397
Lady Gaga        395
Justin Bieber    347
Coldplay         333
Katy Perry       324
Nicki Minaj      321
Ariana Grande    308
Ed Sheeran       294
Dua Lipa         246
Maroon 5         197
Selena Gomez     174
Post Malone      148
Billie Eilish    145
Charlie Puth      75
Cardi B           75
Khalid            64
Name: count, dtype: int64

## Text Preprocessing

In [15]:
new_df = df.copy()

In [16]:
new_df

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
2,Ariana Grande,​God is a woman,you you love it how i move you you love it how...
3,Ariana Grande,Side To Side,ariana grande nicki minaj i've been here all ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i wanna be in...
...,...,...,...
5744,Taylor Swift,Teardrops on my Guitar (Live from Clear Channe...,drew looks at me i fake a smile so he won't se...
5745,Taylor Swift,Evermore [Forward],to put it plainly we just couldnt stop writing...
5746,Taylor Swift,Welcome Back Grunwald,turn wycd on you're on your grunwald back from...
5747,Taylor Swift,Tolerate it (Polskie Tłumaczenie),zwrotka siedzę i patrzę jak czytasz z głową p...


### Lowercasing

In [17]:
new_df['Title'] = new_df['Title'].str.lower()
new_df['Lyric'] = new_df['Lyric'].str.lower()

In [18]:
new_df.head(3)

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...


#### Cleaning functions

In [19]:
import re

In [20]:
# Pre-checking the lyrics
def has_html_tags(text):
    """Check if text contains HTML tags."""
    if not isinstance(text, str):
        return False
    return bool(re.search(r'<[^>]+>', text))

def has_urls(text):
    """Check if text contains URLs."""
    if not isinstance(text, str):
        return False
    url_pattern = r'https?://\S+|www\.\S+|\S+\.\S+/\S+'
    return bool(re.search(url_pattern, text))

# def has_punctuation(text):
#     """Check if text contains punctuation."""
#     if not isinstance(text, str):
#         return False
#     return any(c in string.punctuation for c in text)

In [21]:
# Function to remove html tags and urls
def remove_html_tags(text):
    """Remove HTML tags from text."""
    if not isinstance(text, str):
        return text
    pattern = re.compile(r'<.*?>')
    return pattern.sub('', text).strip()

def remove_urls(text):
    """Remove URLs from text."""
    if not isinstance(text, str):
        return text
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text).strip()

# def remove_punctuation(text):
#     """Remove punctuation from text."""
#     if not isinstance(text, str):
#         return text
#     return text.translate(str.maketrans('', '', string.punctuation))

In [22]:
lyric_html_count = new_df['Lyric'].apply(has_html_tags).sum()
lyric_url_count = new_df['Lyric'].apply(has_urls).sum()
# lyric_punctuation_count = new_df['Lyric'].apply(has_punctuation).sum()

# title_html_count = new_df['Title'].apply(has_html_tags).sum()
# title_url_count = new_df['Title'].apply(has_urls).sum()
# title_punctuation_count = new_df['Title'].apply(has_punctuation).sum()

total_rows = len(new_df)
print("Precheck Summary:")
print(f"Lyrics with HTML tags: {lyric_html_count} ({(lyric_html_count/total_rows)*100:.2f}%)")
print(f"Lyrics with URLs: {lyric_url_count} ({(lyric_url_count/total_rows)*100:.2f}%)")
# print(f"Lyrics with punctuation: {lyric_punctuation_count} ({(lyric_punctuation_count/total_rows)*100:.2f}%)")

# print(f"Titles with HTML tags: {title_html_count} ({(title_html_count/total_rows)*100:.2f}%)")
# print(f"Titles with URLs: {title_url_count} ({(title_url_count/total_rows)*100:.2f}%)")
# print(f"Titles with punctuation: {title_punctuation_count} ({(title_punctuation_count/total_rows)*100:.2f}%)")

Precheck Summary:
Lyrics with HTML tags: 0 (0.00%)
Lyrics with URLs: 0 (0.00%)


In [23]:
# Cleaning based on pre-check results

if lyric_html_count > 0:
    new_df['Lyric'] = new_df['Lyric'].apply(remove_html_tags)

if lyric_url_count > 0:
    new_df['Lyric'] = new_df['Lyric'].apply(remove_urls)
    
# if lyric_punctuation_count > 0:
#     new_df['Lyric'] = new_df['Lyric'].apply(remove_punctuation)

In [24]:
# Recalculate after cleaning
lyric_html_count_after = new_df['Lyric'].apply(has_html_tags).sum()
lyric_url_count_after = new_df['Lyric'].apply(has_urls).sum()
# lyric_punctuation_count_after = new_df['Lyric'].apply(has_punctuation).sum()

print("\nPost-cleaning Result:")
print(f"Lyrics with HTML tags: {lyric_html_count_after} ({(lyric_html_count_after/total_rows)*100:.2f}%)")
print(f"Lyrics with URLs: {lyric_url_count_after} ({(lyric_url_count_after/total_rows)*100:.2f}%)")
# print(f"Lyrics with punctuation: {lyric_punctuation_count_after} ({(lyric_punctuation_count_after/total_rows)*100:.2f}%)")



Post-cleaning Result:
Lyrics with HTML tags: 0 (0.00%)
Lyrics with URLs: 0 (0.00%)


In [25]:
new_df['Lyric'].head()

0    thought i'd end up with sean but he wasn't a m...
1    yeah breakfast at tiffany's and bottles of bub...
2    you you love it how i move you you love it how...
3    ariana grande  nicki minaj i've been here all ...
4    right now i'm in a state of mind i wanna be in...
Name: Lyric, dtype: object

## Chat Words Treatment

In [26]:
chat_words ={
    "u": "you",
    "ur": "your",
    "r": "are",
    "ya": "you",
    "luv": "love",
    "b4": "before",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "lemme": "let me",
    "gimme": "give me",
    "im": "i am",
    "dont": "don't",
    "idk": "i don't know",
    "brb": "be right back",
    "lol": "laughing out loud",
    "omg": "oh my god",
    "ttyl": "talk to you later",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "bff": "best friends forever",
    "ily": "i love you",
    "jk": "just kidding",
    "bday": "birthday",
    "cuz": "because",
    "coz": "because",
    "thx": "thanks",
    "pls": "please",
    "plz": "please",
    "bae": "before anyone else",
    "dm": "direct message",
    "noob": "beginner",
    "sk8": "skate",
    "gr8": "great",
    "np": "no problem",
    "l8r": "later",
    "idc": "i don't care",
    "zzz": "sleeping",
    "omw": "on my way"
}

# Checking the number of lyrics containing the chat words

def contains_chat_words(text):
    if not isinstance(text, str):
        return False
    words = text.split()
    return any(word in chat_words for word in words)

# Check how many lyrics contain chat words
chat_word_count = new_df['Lyric'].apply(contains_chat_words).sum()
print(f"Number of lyrics containing chat words: {chat_word_count}")

# Removing the chat words from the lyrics

def chat_word_treatment(text):
    if not isinstance(text, str):
        return text
    words = text.split()
    normalized_words = [chat_words.get(w.lower(), w) for w in words]
    return ' '.join(normalized_words)

# Apply to the lyric column
new_df['Lyric'] = new_df['Lyric'].apply(chat_word_treatment)

chat_word_count_after = new_df['Lyric'].apply(contains_chat_words).sum()
print(f"Number of lyrics containing chat words: {chat_word_count_after}")

Number of lyrics containing chat words: 3374
Number of lyrics containing chat words: 0


## Spelling Correction

In [27]:
pip install pyspellchecker

Note: you may need to restart the kernel to use updated packages.


In [38]:
# from spellchecker import SpellChecker

# spell = SpellChecker()

# def count_misspelled_words(text):
#     if not isinstance(text, str):
#         return 0
#     words = text.split()
#     misspelled = spell.unknown(words)
#     return len(misspelled)

# total_misspelled_words = new_df['Lyric'].apply(count_misspelled_words).sum()
# print(f"Total misspelled words: {total_misspelled_words}")

# # def correct_spelling(text):
# #     if not isinstance(text, str):
# #         return text
# #     corrected_words = []
# #     for word in text.split():
# #         corrected = spell.correction(word)
# #         corrected_words.append(corrected if corrected else word)
# #     return ' '.join(corrected_words)

from spellchecker import SpellChecker

spell = SpellChecker()

# Function to return misspelled words in a text
def get_misspelled_words(text):
    if not isinstance(text, str):
        return []
    words = text.split()
    return list(spell.unknown(words))

# # create a new column with misspelled words
# new_df['Misspelled_Words'] = new_df['Lyric'].apply(get_misspelled_words)

# # Total misspelled words count
# total_misspelled_words = new_df['Misspelled_Words'].apply(len).sum()
# print(f"Total misspelled words: {total_misspelled_words}")

# # View the misspelled words only (unique across dataset)
# from itertools import chain

# all_misspelled = list(chain.from_iterable(new_df['Misspelled_Words']))
# unique_misspelled = set(all_misspelled)

# print(f"Unique misspelled words ({len(unique_misspelled)}):")
# print(unique_misspelled)

# # View the first 10 rows with their misspelled words
# print(new_df[['Misspelled_Words']].head(10))

misspelled_words = new_df['Lyric'].apply(get_misspelled_words)
total_misspelled_words = misspelled_words.apply(len).sum()
print(f"Total misspelled words: {total_misspelled_words}")

# Unique misspelled words
unique_misspelled = set(misspelled_words.explode().dropna().tolist())
print(f"Unique misspelled words ({len(unique_misspelled)}):")
print(unique_misspelled)  

new_df['Misspelled_Words'] = new_df['Lyric'].apply(get_misspelled_words)
  


Total misspelled words: 59049
Unique misspelled words (19201):
{'corso', 'miseducation', 'impalis', 'juga', 'mungojerrie', 'nnashville', '888cutsomething', 'može', 'pióra', 'alejandrx', 'cada', 'rosado', 'امامنهمونطور', 'dropswhat', 'nezůstane', "tremblin'", 'bedava', 'dziecko', 'lbc', "jigglin'", 'royale', 'baisley', 'καρδία', 'ssame', 'tcf', 'paixão', 'paigon', "stealin'", 'conversating', 'escuchar', "'tll", 'jespère', 'scally', 'trut', 'leshurr', 'kaşar', 'dugni', 'crabtree', 'earings', "tweakin'", 'sixtythousand', 'ruh', 'eenie', 'nickelette', 'onehunnid', '你真該看看我戴上后冠', 'adlib', 'trampa', 'هام', 'aaagain', 'voicemail', "comin'", "shot'll", 'dnice', 'chingy', "huffin'", 'miuccia', "samplin'", 'tampoco', 'hunh', 'unleveled', 'saklıcam', 'upadło', "hoggin'", "travellin'", 'duyduğumuz', 'alt', 'курю', '一個接一個', 'esham', 'bunji', 'pompompompom', 'rrraahhhhhh', 'yerlerde', 'matress', 'appétit', 'neumann', 'rzuciłbym', 'ble', "doubtin'", "quelqu'un", "cru's", "evenin'", 'fouryearold', 'çev

In [39]:
new_df

Unnamed: 0,Artist,Title,Lyric,Misspelled_Words
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,"[ari, 'cause, pre, fuckin', 'bout, gon', havin..."
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,"[settin', atm, pre, redbottoms, lookin', gon',..."
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...,"[feelin', pre, tellin', onetrack, uniwhen]"
3,Ariana Grande,side to side,ariana grande nicki minaj i've been here all n...,"[grande, pre, 'bout, nothin', minime, feelin',..."
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i want to be ...,"[comin', ohyeah, pre, livin', turnin', lovin',..."
...,...,...,...,...
5744,Taylor Swift,teardrops on my guitar (live from clear channe...,drew looks at me i fake a smile so he won't se...,"[whos, pre, 'cause]"
5745,Taylor Swift,evermore [forward],to put it plainly we just couldnt stop writing...,"[ive, oneoff, couldnt, folklorian, imaginaryno..."
5746,Taylor Swift,welcome back grunwald,turn wycd on you're on your grunwald back from...,"[grunwald's, wycd, grunwald]"
5747,Taylor Swift,tolerate it (polskie tłumaczenie),zwrotka siedzę i patrzę jak czytasz z głową po...,"[cię, moje, jak, pobłażliwie, wszystko, błyszc..."


In [40]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [41]:
import re
from spellchecker import SpellChecker
from collections import Counter
from nltk.corpus import words
import nltk

In [42]:
nltk.download('all')
spell = SpellChecker()
english_words = set(words.words())

def is_contraction(word):
    patterns = [r".*in'$", r".*n'$", r"^ain’t$", r".*’re$", r".*’ll$", r".*’t$"]
    return any(re.match(pattern, word.lower()) for pattern in patterns)

def is_keep_word(word):
    patterns = [
        r"^(yee|woo|ayy|ooh|uhh|ha|brr|skrrt|la|oh|woah)+$",  # Ad-libs and repeated syllables
        r".*(.)\1{2,}.*",  # Repeated letters (e.g., skrrrt, yesss)
        r"^[a-z]+(z|s){2,}$"  # Words ending in multiple z/s (e.g., buzzz)
    ]
    return any(re.match(pattern, word.lower()) for pattern in patterns)

def is_non_english(word):
    return bool(re.search(r'[^\x00-\x7F]', word))  # Non-ASCII characters

def has_numbers(word):
    return bool(re.search(r'\d', word.lower())) 

def build_valid_words(texts, min_freq=5):
    all_words = []
    for text in texts:
        if isinstance(text, str):
            all_words.extend(text.split())
    word_counts = Counter(all_words)
    valid_words = set(word.lower() for word, count in word_counts.items()
                      if count >= min_freq and not has_numbers(word) and
                      (is_contraction(word) or is_keep_word(word)))
    valid_words.update(word for word in english_words if not has_numbers(word))
    return valid_words

def build_valid_words(texts, min_freq=5):
    all_words = []
    for text in texts:
        if isinstance(text, str):
            all_words.extend(text.split())
    word_counts = Counter(all_words)
    valid_words = set(word.lower() for word, count in word_counts.items()
                      if count >= min_freq and not has_numbers(word) and
                      (is_contraction(word) or is_keep_word(word)))
    valid_words.update(word for word in english_words if not has_numbers(word))
    return valid_words

def preprocess_text(text, valid_words):
    if not isinstance(text, str):
        return '', [], []
    words = text.split()
    processed_words = []
    word_lengths = []
    corrected_words = []
    misspelled = (
        'rumplestilskin', 'přijde', 'crunked', 'foreverholdagrudge', 'sneezin',
        'seeeeat', 'boogieoogieoogie', 'therealyou', 'ventin',
        'hahahah', 'mighta', 'mulsanne', 'chillin', 'buttdial', 'bimmer',
        'flyest', 'bleh', 'puuuuuuuke', 'rollup', 'rosecolored'
    )
    for word in words:
        word_lower = word.lower()
        if has_numbers(word):
            corrected_words.append(f"{word}→Removed(Number)")
            continue
        if word_lower in misspelled:
            corrected_words.append(f"{word}→Removed(Misspelled)")
            continue
        if word_lower in valid_words or is_contraction(word) or is_keep_word(word):
            processed_words.append(word)
            corrected_words.append(f"{word}→None")
        elif is_non_english(word):
            corrected_words.append(f"{word}→Removed(Non-English)")
            continue
        else:
            correction = spell.correction(word)
            if correction and not has_numbers(correction) and correction in english_words:
                processed_words.append(correction)
                corrected_words.append(f"{word}→{correction}")
            else:
                processed_words.append(word)
                corrected_words.append(f"{word}→None")
    return ' '.join(processed_words), word_lengths, corrected_words


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\LENOVO\AppData\Roaming\nltk_data

In [43]:
texts = ["I'm playin' feetball स्क्रिप्ट विशेषता rumplestilskin chillin 6ty in my bimmer, yesss sir!", "He interupting the show like rumplestilskin mispelled"]
valid_words = build_valid_words(texts)

for text in texts:
    cleaned,_, corrections = preprocess_text(text, valid_words)
    print("Cleaned Text:", cleaned)
    print("Corrections:", corrections)

Cleaned Text: I'm playin' football in my simmer yesss sir
Corrections: ["I'm→None", "playin'→None", 'feetball→football', 'स्क्रिप्ट→Removed(Non-English)', 'विशेषता→Removed(Non-English)', 'rumplestilskin→Removed(Misspelled)', 'chillin→Removed(Misspelled)', '6ty→Removed(Number)', 'in→None', 'my→None', 'bimmer,→simmer', 'yesss→None', 'sir!→sir']
Cleaned Text: He interrupting the show like mispelled
Corrections: ['He→None', 'interupting→interrupting', 'the→None', 'show→None', 'like→None', 'rumplestilskin→Removed(Misspelled)', 'mispelled→None']


In [44]:
valid_words = build_valid_words(new_df['Lyric'], min_freq=5)


In [45]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [46]:
# from tqdm import tqdm
# tqdm.pandas()

# new_df['Lyric_Filtered'] = new_df['Lyric'].progress_apply(lambda x: preprocess_text(x, valid_words)[0])


from tqdm import tqdm
tqdm.pandas()  # for progress bar

def process_if_misspelled(row):
    if row['Misspelled_Words']:  # if not an empty list
        return preprocess_text(row['Lyric'], valid_words)[0]  # filtered text only
    else:
        return row['Lyric']  # leave it unchanged

# Apply with progress bar and axis=1 for row-wise operation
new_df['Lyric_Filtered'] = new_df.progress_apply(process_if_misspelled, axis=1)


  0%|          | 0/5711 [00:00<?, ?it/s]

100%|██████████| 5711/5711 [3:32:46<00:00,  2.24s/it]   


In [47]:
print(new_df.columns)


Index(['Artist', 'Title', 'Lyric', 'Misspelled_Words', 'Lyric_Filtered'], dtype='object')


In [81]:
print(new_df.columns)

Index(['Lyric', 'Lyric_Preprocessed', 'Lyric_Filtered', 'Word_Lengths',
       'Corrections'],
      dtype='object')


### Export into new CSV file

In [48]:
new_df.to_csv(r"E:\Lyrics_Generator\Data_Sets\filtered_songs.csv", index=False)

print("Filtered lyrics saved to 'filtered_songs.csv' successfully!")

Filtered lyrics saved to 'filtered_songs.csv' successfully!


### Loading the filtered DataSet

In [50]:
filtered_df = pd.read_csv(r"E:\Lyrics_Generator\Data_Sets\filtered_songs.csv")

filtered_df.head(3)

Unnamed: 0,Artist,Title,Lyric,Misspelled_Words,Lyric_Filtered
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,"['ari', ""'cause"", 'pre', ""fuckin'"", ""'bout"", ""...",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,"[""settin'"", 'atm', 'pre', 'redbottoms', ""looki...",yeah breakfast at tiffany and bottles of bubbl...
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...,"[""feelin'"", 'pre', ""tellin'"", 'onetrack', 'uni...",you you love it how i move you you love it how...


In [52]:
filtered_df.shape

(5711, 5)

In [53]:
filtered_df.isnull().sum()

Artist              0
Title               0
Lyric               0
Misspelled_Words    0
Lyric_Filtered      1
dtype: int64

In [54]:
null_row = filtered_df[filtered_df['Lyric_Filtered'].isnull()]
print(null_row)


      Artist                                   Title  \
2865  Eminem  premonition (intro) (kurdish subtitle)   

                                                  Lyric  \
2865  ئینتڕۆ ئێمینێم هەڵیبێنم ئەمە چیە ها وابزانم ڕو...   

                                       Misspelled_Words Lyric_Filtered  
2865  ['ئینتڕۆ', 'نەمان', 'نیم', 'بەردەوامبین', 'شێو...            NaN  


In [55]:
preprocess_text(filtered_df.loc[2865, 'Lyric'], valid_words)


('',
 [],
 ['ئینتڕۆ→Removed(Non-English)',
  'ئێمینێم→Removed(Non-English)',
  'هەڵیبێنم→Removed(Non-English)',
  'ئەمە→Removed(Non-English)',
  'چیە→Removed(Non-English)',
  'ها→Removed(Non-English)',
  'وابزانم→Removed(Non-English)',
  'ڕوون→Removed(Non-English)',
  'دیارە→Removed(Non-English)',
  'ئێمە→Removed(Non-English)',
  'هەرگیز→Removed(Non-English)',
  'ڕوبەڕووی→Removed(Non-English)',
  'یەکدی→Removed(Non-English)',
  'نابینەوە→Removed(Non-English)',
  'بەڵام→Removed(Non-English)',
  'ئەمە→Removed(Non-English)',
  'هەزەلیە→Removed(Non-English)',
  'چەنە→Removed(Non-English)',
  'ڕقم→Removed(Non-English)',
  'لێتە→Removed(Non-English)',
  'پێویستم→Removed(Non-English)',
  'پێتە→Removed(Non-English)',
  'ئەمە→Removed(Non-English)',
  'مۆسیقای→Removed(Non-English)',
  'بکوژە→Removed(Non-English)',
  'هووک→Removed(Non-English)',
  'نیکی→Removed(Non-English)',
  'گرایەر→Removed(Non-English)',
  'دڵم→Removed(Non-English)',
  'خەبەری→Removed(Non-English)',
  'داوە→Removed(Non-Englis

In [62]:
filtered_df.isnull().sum()

Artist              0
Title               0
Lyric               0
Misspelled_Words    0
Lyric_Filtered      1
dtype: int64

In [63]:
filtered_df = filtered_df.dropna(subset=['Lyric_Filtered'])


In [64]:
filtered_df.isnull().sum()

Artist              0
Title               0
Lyric               0
Misspelled_Words    0
Lyric_Filtered      0
dtype: int64

In [66]:
filtered_df.shape

(5710, 5)

## Feature Engineering

In [None]:
# # Creating a mapping dictionary from artist to genre
# genre_map = {
#     'Eminem': 'Hip Hop / Rap',
#     'Taylor Swift': 'Pop / Country / Synthpop',
#     'Drake': 'Hip Hop / Rap / R&B',
#     'Beyoncé': 'R&B / Pop / Hip Hop',
#     'Rihanna': 'Pop / R&B / Dancehall',
#     'Lady Gaga': 'Pop / Dance / Electronic',
#     'Justin Bieber': 'Pop / R&B',
#     'Coldplay': 'Alternative Rock / Pop Rock',
#     'Katy Perry': 'Pop',
#     'Nicki Minaj': 'Hip Hop / Rap / Pop',
#     'Ariana Grande': 'Pop / R&B',
#     'Ed Sheeran': 'Pop / Folk Pop',
#     'BTS (방탄소년단)': 'K-pop / Pop / Hip Hop',
#     'Dua Lipa': 'Pop / Dance / Disco',
#     'Maroon 5': 'Pop Rock / Funk Pop',
#     'Selena Gomez': 'Pop / Dance Pop',
#     'Post Malone': 'Hip Hop / Pop / Trap',
#     'Billie Eilish': 'Alternative / Pop / Electropop',
#     'Charlie Puth': 'Pop / R&B',
#     'Cardi B': 'Hip Hop / Rap',
#     'Khalid': 'R&B / Pop / Soul'
# }

In [None]:
# Adding the new 'Genre' column based on the mapping

# df['Genre'] = df['Artist'].map(genre_map)

In [None]:
# df.head()

Unnamed: 0,Artist,Title,Lyric,Genre
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,Pop / R&B
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,Pop / R&B
2,Ariana Grande,​God is a woman,you you love it how i move you you love it how...,Pop / R&B
3,Ariana Grande,Side To Side,ariana grande nicki minaj i've been here all ...,Pop / R&B
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i wanna be in...,Pop / R&B


In [None]:
# df['Genre'].isnull().sum()

0

In [None]:
# df['Genre'].value_counts()

Genre
Pop / R&B                         730
Hip Hop / Rap                     596
Pop / Country / Synthpop          477
Hip Hop / Rap / R&B               464
R&B / Pop / Hip Hop               406
Pop / R&B / Dancehall             397
Pop / Dance / Electronic          395
Alternative Rock / Pop Rock       333
Pop                               324
Hip Hop / Rap / Pop               321
Pop / Folk Pop                    294
K-pop / Pop / Hip Hop             270
Pop / Dance / Disco               246
Pop Rock / Funk Pop               197
Pop / Dance Pop                   174
Hip Hop / Pop / Trap              148
Alternative / Pop / Electropop    145
R&B / Pop / Soul                   64
Name: count, dtype: int64

In [None]:
# df['Genre'].unique()

array(['Pop / R&B', 'R&B / Pop / Hip Hop',
       'Alternative / Pop / Electropop', 'K-pop / Pop / Hip Hop',
       'Hip Hop / Rap', 'Alternative Rock / Pop Rock',
       'Hip Hop / Rap / R&B', 'Pop / Dance / Disco', 'Pop / Folk Pop',
       'Pop', 'R&B / Pop / Soul', 'Pop / Dance / Electronic',
       'Pop Rock / Funk Pop', 'Hip Hop / Rap / Pop',
       'Hip Hop / Pop / Trap', 'Pop / R&B / Dancehall', 'Pop / Dance Pop',
       'Pop / Country / Synthpop'], dtype=object)