# Lyrics Generator

In [95]:
import os
import pandas as pd

In [96]:

# Define input and output paths
folder_path = r"E:\Lyrics_Generator\Data_Sets\Songs"
output_folder = r"E:\Lyrics_Generator\Data_Sets"
output_file = os.path.join(output_folder, "merged_songs.csv")

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Get all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith(".csv")]

# Read and merge
dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
merged_df = pd.concat(dfs, ignore_index=True)

# Save to CSV
merged_df.to_csv(output_file, index=False)
print(f"Merged {len(csv_files)} CSV files into '{output_file}' successfully!")

# folder_path = "E:\AI\Lyrics_Generator_RNN\Song_dataset"

# csv_files = [file for file in os.listdir(folder_path) if file.endswith(".csv")]

# dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
# merged_df = pd.concat(dfs, ignore_index=True)

# merged_df.to_csv(r"E:\AI\Lyrics_Generator_RNN\Data_sets\merged_songs.csv", index=False)

# print(f"Merged {len(csv_files)} CSV files into 'merged_songs.csv' successfully!")


Merged 20 CSV files into 'E:\Lyrics_Generator\Data_Sets\merged_songs.csv' successfully!


## Loading the DataSets

In [97]:
df = pd.read_csv(r"E:\Lyrics_Generator\Data_sets\merged_songs.csv")

In [98]:
df.head(3)

Unnamed: 0.1,Artist,Title,Album,Date,Lyric,Year,Unnamed: 0
0,Ariana Grande,"​thank u, next","thank u, next",2018-11-03,thought i'd end up with sean but he wasn't a m...,2018.0,
1,Ariana Grande,7 rings,"thank u, next",2019-01-18,yeah breakfast at tiffany's and bottles of bub...,2019.0,
2,Ariana Grande,​God is a woman,Sweetener,2018-07-13,you you love it how i move you you love it how...,2018.0,


## Exploring the data

In [99]:
df['Artist'].value_counts()

Artist
Eminem           521
Taylor Swift     479
Drake            466
Beyoncé          406
Rihanna          405
Lady Gaga        402
Justin Bieber    348
Coldplay         344
Katy Perry       325
Nicki Minaj      323
Ariana Grande    308
Ed Sheeran       296
Dua Lipa         247
Maroon 5         197
Selena Gomez     175
Post Malone      148
Billie Eilish    145
Charlie Puth      75
Cardi B           75
Khalid            64
Name: count, dtype: int64

In [100]:
df.shape

(5749, 7)

## Checking for the missing data

In [101]:
df.isnull().sum()

Artist           0
Title            0
Album         1552
Date          1744
Lyric           38
Year          1744
Unnamed: 0     308
dtype: int64

In [102]:
df = df.drop(columns=['Album', 'Date', 'Year', 'Unnamed: 0'])

In [103]:
df.isnull().sum()

Artist     0
Title      0
Lyric     38
dtype: int64

In [104]:
df = df.dropna(subset=['Lyric'])  # Remove rows with missing lyrics
df = df.drop_duplicates()  # Remove duplicate lyrics

In [105]:
df.isnull().sum()

Artist    0
Title     0
Lyric     0
dtype: int64

In [106]:
df.shape

(5711, 3)

In [107]:
df['Artist'].value_counts()

Artist
Eminem           521
Taylor Swift     477
Drake            464
Beyoncé          406
Rihanna          397
Lady Gaga        395
Justin Bieber    347
Coldplay         333
Katy Perry       324
Nicki Minaj      321
Ariana Grande    308
Ed Sheeran       294
Dua Lipa         246
Maroon 5         197
Selena Gomez     174
Post Malone      148
Billie Eilish    145
Charlie Puth      75
Cardi B           75
Khalid            64
Name: count, dtype: int64

## Text Preprocessing

In [108]:
new_df = df.copy()

In [109]:
new_df

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
2,Ariana Grande,​God is a woman,you you love it how i move you you love it how...
3,Ariana Grande,Side To Side,ariana grande nicki minaj i've been here all ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i wanna be in...
...,...,...,...
5744,Taylor Swift,Teardrops on my Guitar (Live from Clear Channe...,drew looks at me i fake a smile so he won't se...
5745,Taylor Swift,Evermore [Forward],to put it plainly we just couldnt stop writing...
5746,Taylor Swift,Welcome Back Grunwald,turn wycd on you're on your grunwald back from...
5747,Taylor Swift,Tolerate it (Polskie Tłumaczenie),zwrotka siedzę i patrzę jak czytasz z głową p...


### Lowercasing

In [110]:
new_df['Title'] = new_df['Title'].str.lower()
new_df['Lyric'] = new_df['Lyric'].str.lower()

In [111]:
new_df.head(3)

Unnamed: 0,Artist,Title,Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...


#### Cleaning functions

In [112]:
import re

In [113]:
# Pre-checking the lyrics
def has_html_tags(text):
    """Check if text contains HTML tags."""
    if not isinstance(text, str):
        return False
    return bool(re.search(r'<[^>]+>', text))

def has_urls(text):
    """Check if text contains URLs."""
    if not isinstance(text, str):
        return False
    url_pattern = r'https?://\S+|www\.\S+|\S+\.\S+/\S+'
    return bool(re.search(url_pattern, text))

# def has_punctuation(text):
#     """Check if text contains punctuation."""
#     if not isinstance(text, str):
#         return False
#     return any(c in string.punctuation for c in text)

In [114]:
# Function to remove html tags and urls
def remove_html_tags(text):
    """Remove HTML tags from text."""
    if not isinstance(text, str):
        return text
    pattern = re.compile(r'<.*?>')
    return pattern.sub('', text).strip()

def remove_urls(text):
    """Remove URLs from text."""
    if not isinstance(text, str):
        return text
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text).strip()

# def remove_punctuation(text):
#     """Remove punctuation from text."""
#     if not isinstance(text, str):
#         return text
#     return text.translate(str.maketrans('', '', string.punctuation))

In [115]:
lyric_html_count = new_df['Lyric'].apply(has_html_tags).sum()
lyric_url_count = new_df['Lyric'].apply(has_urls).sum()
# lyric_punctuation_count = new_df['Lyric'].apply(has_punctuation).sum()

# title_html_count = new_df['Title'].apply(has_html_tags).sum()
# title_url_count = new_df['Title'].apply(has_urls).sum()
# title_punctuation_count = new_df['Title'].apply(has_punctuation).sum()

total_rows = len(new_df)
print("Precheck Summary:")
print(f"Lyrics with HTML tags: {lyric_html_count} ({(lyric_html_count/total_rows)*100:.2f}%)")
print(f"Lyrics with URLs: {lyric_url_count} ({(lyric_url_count/total_rows)*100:.2f}%)")
# print(f"Lyrics with punctuation: {lyric_punctuation_count} ({(lyric_punctuation_count/total_rows)*100:.2f}%)")

# print(f"Titles with HTML tags: {title_html_count} ({(title_html_count/total_rows)*100:.2f}%)")
# print(f"Titles with URLs: {title_url_count} ({(title_url_count/total_rows)*100:.2f}%)")
# print(f"Titles with punctuation: {title_punctuation_count} ({(title_punctuation_count/total_rows)*100:.2f}%)")

Precheck Summary:
Lyrics with HTML tags: 0 (0.00%)
Lyrics with URLs: 0 (0.00%)


In [116]:
# Cleaning based on pre-check results

if lyric_html_count > 0:
    new_df['Lyric'] = new_df['Lyric'].apply(remove_html_tags)

if lyric_url_count > 0:
    new_df['Lyric'] = new_df['Lyric'].apply(remove_urls)
    
# if lyric_punctuation_count > 0:
#     new_df['Lyric'] = new_df['Lyric'].apply(remove_punctuation)

In [117]:
# Recalculate after cleaning
lyric_html_count_after = new_df['Lyric'].apply(has_html_tags).sum()
lyric_url_count_after = new_df['Lyric'].apply(has_urls).sum()
# lyric_punctuation_count_after = new_df['Lyric'].apply(has_punctuation).sum()

print("\nPost-cleaning Result:")
print(f"Lyrics with HTML tags: {lyric_html_count_after} ({(lyric_html_count_after/total_rows)*100:.2f}%)")
print(f"Lyrics with URLs: {lyric_url_count_after} ({(lyric_url_count_after/total_rows)*100:.2f}%)")
# print(f"Lyrics with punctuation: {lyric_punctuation_count_after} ({(lyric_punctuation_count_after/total_rows)*100:.2f}%)")



Post-cleaning Result:
Lyrics with HTML tags: 0 (0.00%)
Lyrics with URLs: 0 (0.00%)


In [118]:
new_df['Lyric'].head()

0    thought i'd end up with sean but he wasn't a m...
1    yeah breakfast at tiffany's and bottles of bub...
2    you you love it how i move you you love it how...
3    ariana grande  nicki minaj i've been here all ...
4    right now i'm in a state of mind i wanna be in...
Name: Lyric, dtype: object

## Chat Words Treatment

In [119]:
chat_words ={
    "u": "you",
    "ur": "your",
    "r": "are",
    "ya": "you",
    "luv": "love",
    "b4": "before",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "lemme": "let me",
    "gimme": "give me",
    "im": "i am",
    "dont": "don't",
    "idk": "i don't know",
    "brb": "be right back",
    "lol": "laughing out loud",
    "omg": "oh my god",
    "ttyl": "talk to you later",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "bff": "best friends forever",
    "ily": "i love you",
    "jk": "just kidding",
    "bday": "birthday",
    "cuz": "because",
    "coz": "because",
    "thx": "thanks",
    "pls": "please",
    "plz": "please",
    "bae": "before anyone else",
    "dm": "direct message",
    "noob": "beginner",
    "sk8": "skate",
    "gr8": "great",
    "np": "no problem",
    "l8r": "later",
    "idc": "i don't care",
    "zzz": "sleeping",
    "omw": "on my way"
}

# Checking the number of lyrics containing the chat words

def contains_chat_words(text):
    if not isinstance(text, str):
        return False
    words = text.split()
    return any(word in chat_words for word in words)

# Check how many lyrics contain chat words
chat_word_count = new_df['Lyric'].apply(contains_chat_words).sum()
print(f"Number of lyrics containing chat words: {chat_word_count}")

# Removing the chat words from the lyrics

def chat_word_treatment(text):
    if not isinstance(text, str):
        return text
    words = text.split()
    normalized_words = [chat_words.get(w.lower(), w) for w in words]
    return ' '.join(normalized_words)

# Apply to the lyric column
new_df['Lyric'] = new_df['Lyric'].apply(chat_word_treatment)

chat_word_count_after = new_df['Lyric'].apply(contains_chat_words).sum()
print(f"Number of lyrics containing chat words: {chat_word_count_after}")

Number of lyrics containing chat words: 3374
Number of lyrics containing chat words: 0


## Spelling Correction

In [120]:
pip install pyspellchecker

Note: you may need to restart the kernel to use updated packages.


In [121]:
# from spellchecker import SpellChecker

# spell = SpellChecker()

# def count_misspelled_words(text):
#     if not isinstance(text, str):
#         return 0
#     words = text.split()
#     misspelled = spell.unknown(words)
#     return len(misspelled)

# total_misspelled_words = new_df['Lyric'].apply(count_misspelled_words).sum()
# print(f"Total misspelled words: {total_misspelled_words}")

# # def correct_spelling(text):
# #     if not isinstance(text, str):
# #         return text
# #     corrected_words = []
# #     for word in text.split():
# #         corrected = spell.correction(word)
# #         corrected_words.append(corrected if corrected else word)
# #     return ' '.join(corrected_words)

from spellchecker import SpellChecker

spell = SpellChecker()

# Function to return misspelled words in a text
def get_misspelled_words(text):
    if not isinstance(text, str):
        return []
    words = text.split()
    return list(spell.unknown(words))

# # create a new column with misspelled words
# new_df['Misspelled_Words'] = new_df['Lyric'].apply(get_misspelled_words)

# # Total misspelled words count
# total_misspelled_words = new_df['Misspelled_Words'].apply(len).sum()
# print(f"Total misspelled words: {total_misspelled_words}")

# # View the misspelled words only (unique across dataset)
# from itertools import chain

# all_misspelled = list(chain.from_iterable(new_df['Misspelled_Words']))
# unique_misspelled = set(all_misspelled)

# print(f"Unique misspelled words ({len(unique_misspelled)}):")
# print(unique_misspelled)

# # View the first 10 rows with their misspelled words
# print(new_df[['Misspelled_Words']].head(10))

misspelled_words = new_df['Lyric'].apply(get_misspelled_words)
total_misspelled_words = misspelled_words.apply(len).sum()
print(f"Total misspelled words: {total_misspelled_words}")

# Unique misspelled words
unique_misspelled = set(misspelled_words.explode().dropna().tolist())
print(f"Unique misspelled words ({len(unique_misspelled)}):")
print(unique_misspelled)  

new_df['Misspelled_Words'] = new_df['Lyric'].apply(get_misspelled_words)
  


Total misspelled words: 59049
Unique misspelled words (19201):
{'woahvery', 'انجام', 'courtnee', 'aahah', 'sube', 'areba', 'téged', 'sigo', "''merhaba", "adat's", 'whiiite', 'espalda', 'posie', "leavin'", 'ریشه', 'uglyomarosa', 'twopiecing', 'nerede', 'sanırım', 'snowmangroundup', 'sstation', 'ingerült', 'crosseyed', '900msminaj', 'breal', 'mrchows', 'lyckety', "dodgin'", 'asswhooping', "man'll", 'ouuuu', 'rel', 'soundin', '0berzerk', 'نفره', 'scarface', 'givenchy', 'aalegra', 'creamofthecrop', 'ju', 'دیپلماتیکم', "testin'", 'prérefrain', 'ballers', 'mamy', 'sobre', 'reat', 'sustituirte', 'lameo', 'rumours', 'youngass', 'lluminati', 'وجود', 'głowie', 'megtaláltam', 'cowell', 'bőrszíne', 'منظورش', 'wap', 'bei', 'itmug', 'aightuhslim', 'rhymesayer', "'nelly", 'sikeyim', "j'appartiens", 'ohohwhoa', 'favre', "'cross", 'saudy', 'dresdenben', 'parisien', 'hooooh', 'mikrofon', 'vocoder', 'diz', 'hoohoh', 'excita', 'drugą', 'canadabis', 'sile', 'akzel', 'ringtone', 'onenight', 'vzdát', 'nonman

In [122]:
new_df

Unnamed: 0,Artist,Title,Lyric,Misspelled_Words
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,"[nothin', havin', ari, pre, 'bout, yee, fuckin..."
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,"[redbottoms, tiffany's, gleamin', lookin', pre..."
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...,"[feelin', pre, uniwhen, onetrack, tellin']"
3,Ariana Grande,side to side,ariana grande nicki minaj i've been here all n...,"[feelin', minime, minaj, 'em, makin', ariana, ..."
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i want to be ...,"[turnin', comin', pre, pickin', lovin', livin'..."
...,...,...,...,...
5744,Taylor Swift,teardrops on my guitar (live from clear channe...,drew looks at me i fake a smile so he won't se...,"[pre, 'cause, whos]"
5745,Taylor Swift,evermore [forward],to put it plainly we just couldnt stop writing...,"[folklorian, couldnt, oneoff, imaginarynot, iv..."
5746,Taylor Swift,welcome back grunwald,turn wycd on you're on your grunwald back from...,"[grunwald, grunwald's, wycd]"
5747,Taylor Swift,tolerate it (polskie tłumaczenie),zwrotka siedzę i patrzę jak czytasz z głową po...,"[jeśli, jak, czekam, cię, głową, kolczasty, si..."


In [123]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [124]:
import re
from spellchecker import SpellChecker
from collections import Counter
from nltk.corpus import words
import nltk

In [125]:
# nltk.download('all')
# spell = SpellChecker()
# english_words = set(words.words())

# def is_contraction(word):
#     patterns = [r".*in'$", r".*n'$", r"^ain’t$", r".*’re$", r".*’ll$", r".*’t$"]
#     return any(re.match(pattern, word.lower()) for pattern in patterns)

# def is_keep_word(word):
#     patterns = [
#         r"^(yee|woo|ayy|ooh|uhh|ha|brr|skrrt|la|oh|woah)+$",  # Ad-libs and repeated syllables
#         # r".*(.)\1{2,}.*",  # Repeated letters (e.g., skrrrt, yesss)
#         # r"^[a-z]+(z|s){2,}$"  # Words ending in multiple z/s (e.g., buzzz)
#     ]
#     return any(re.match(pattern, word.lower()) for pattern in patterns)

# def is_non_english(word):
#     return bool(re.search(r'[^\x00-\x7F]', word))  # Non-ASCII characters

# def has_numbers(word):
#     return bool(re.search(r'\d', word.lower())) 

# def build_valid_words(texts, min_freq=5):
#     all_words = []
#     for text in texts:
#         if isinstance(text, str):
#             all_words.extend(text.split())
#     word_counts = Counter(all_words)
#     valid_words = set(word.lower() for word, count in word_counts.items()
#                       if count >= min_freq and not has_numbers(word) and
#                       (is_contraction(word) or is_keep_word(word)))
#     valid_words.update(word for word in english_words if not has_numbers(word))
#     return valid_words

# def build_valid_words(texts, min_freq=5):
#     all_words = []
#     for text in texts:
#         if isinstance(text, str):
#             all_words.extend(text.split())
#     word_counts = Counter(all_words)
#     valid_words = set(word.lower() for word, count in word_counts.items()
#                       if count >= min_freq and not has_numbers(word) and
#                       (is_contraction(word) or is_keep_word(word)))
#     valid_words.update(word for word in english_words if not has_numbers(word))
#     return valid_words

# def preprocess_text(text, valid_words):
#     if not isinstance(text, str):
#         return '', [], []
#     words = text.split()
#     processed_words = []
#     word_lengths = []
#     corrected_words = []
#     misspelled = (
#         'rumplestilskin', 'přijde', 'crunked', 'foreverholdagrudge', 'sneezin',
#         'seeeeat', 'boogieoogieoogie', 'therealyou', 'ventin',
#         'hahahah', 'mighta', 'mulsanne', 'chillin', 'buttdial', 'bimmer',
#         'flyest', 'bleh', 'puuuuuuuke', 'rollup', 'rosecolored'
#     )
#     for word in words:
#         word_lower = word.lower()
#         if has_numbers(word):
#             corrected_words.append(f"{word}→Removed(Number)")
#             continue
#         if word_lower in misspelled:
#             corrected_words.append(f"{word}→Removed(Misspelled)")
#             continue
#         if word_lower in valid_words or is_contraction(word) or is_keep_word(word):
#             processed_words.append(word)
#             corrected_words.append(f"{word}→None")
#         elif is_non_english(word):
#             corrected_words.append(f"{word}→Removed(Non-English)")
#             continue
#         else:
#             correction = spell.correction(word)
#             if correction and not has_numbers(correction) and correction in english_words:
#                 processed_words.append(correction)
#                 corrected_words.append(f"{word}→{correction}")
#             else:
#                 processed_words.append(word)
#                 corrected_words.append(f"{word}→None")
#     return ' '.join(processed_words), word_lengths, corrected_words


In [126]:
import re
from collections import Counter
from spellchecker import SpellChecker
from nltk.corpus import words
import nltk

# Make sure you have downloaded required nltk packages:
nltk.download('words')

spell = SpellChecker()
english_words = set(words.words())

def is_contraction(word):
    patterns = [r".*in'$", r".*n'$", r"^ain’t$", r".*’re$", r".*’ll$", r".*’t$"]
    return any(re.match(pattern, word.lower()) for pattern in patterns)

def is_keep_word(word):
    patterns = [
        r"^(yee|woo|ayy|ooh|uhh|ha|brr|skrrt|la|oh|woah)+$",  # Ad-libs and repeated syllables
        # Uncomment below if you want to allow repeated letters or endings with multiple z/s
        # r".*(.)\1{2,}.*",  # Repeated letters (e.g., skrrrt, yesss)
        # r"^[a-z]+(z|s){2,}$"  # Words ending in multiple z/s (e.g., buzzz)
    ]
    return any(re.match(pattern, word.lower()) for pattern in patterns)

def is_non_english(word):
    return bool(re.search(r'[^\x00-\x7F]', word))  # Contains non-ASCII

def has_numbers(word):
    return bool(re.search(r'\d', word))

def build_valid_words(texts, min_freq=5):
    all_words = []
    for text in texts:
        if isinstance(text, str):
            all_words.extend(text.split())
    word_counts = Counter(all_words)
    valid_words = set(word.lower() for word, count in word_counts.items()
                      if count >= min_freq and not has_numbers(word) and
                      (is_contraction(word) or is_keep_word(word)))
    valid_words.update(word for word in english_words if not has_numbers(word))
    return valid_words

def preprocess_text(text, valid_words):
    if not isinstance(text, str):
        return '', [], []
    words_in_text = text.split()
    processed_words = []
    word_lengths = []
    corrected_words = []

    for word in words_in_text:
        word_lower = word.lower()
        if has_numbers(word):
            corrected_words.append(f"{word}→Removed(Number)")
            continue  # Remove words with numbers

        if is_non_english(word):
            corrected_words.append(f"{word}→Removed(Non-English)")
            continue  # Remove non-English words

        # Check spelling using SpellChecker
        misspelled_set = spell.unknown([word_lower])

        if word_lower in valid_words or is_contraction(word) or is_keep_word(word):
            # Word is valid or allowed contraction/slang
            processed_words.append(word)
            corrected_words.append(f"{word}→None")
        elif misspelled_set:
            # Word is misspelled - try to correct
            correction = spell.correction(word_lower)
            if correction and correction in english_words and not has_numbers(correction):
                processed_words.append(correction)
                corrected_words.append(f"{word}→{correction}")
            else:
                # Can't correct - remove word
                corrected_words.append(f"{word}→Removed(Misspelled)")
        else:
            # Word is spelled correctly but not in valid_words (rare)
            processed_words.append(word)
            corrected_words.append(f"{word}→None")

    return ' '.join(processed_words), word_lengths, corrected_words


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [127]:
texts = [
    "I nothin' ridin' ain't goooogle sevan books| gonna rumplestilskin', 'přijde', 'crunked', 'foreverholdagrudge'let this crunked night stop me!",
    "Yee skrrt ayy woo ooh ha brr",
    "This is a test oaky sentence with some स्क्रिप्ट विशेषता rumplestilskin chillin 6ty in my bimmer misspelled wrds and numbers 1234."
]

valid_words_test = build_valid_words(texts)

for t in texts:
    processed_text, lengths, corrections = preprocess_text(t, valid_words_test)
    print("Original:", t)
    print("Processed:", processed_text)
    print("Corrections:", corrections)
    print()

Original: I nothin' ridin' ain't goooogle sevan books| gonna rumplestilskin', 'přijde', 'crunked', 'foreverholdagrudge'let this crunked night stop me!
Processed: I nothin' ridin' ain't seven gonna this cranked night stop me
Corrections: ['I→None', "nothin'→None", "ridin'→None", "ain't→None", 'goooogle→Removed(Misspelled)', 'sevan→seven', 'books|→Removed(Misspelled)', 'gonna→None', "rumplestilskin',→Removed(Misspelled)", "'přijde',→Removed(Non-English)", "'crunked',→Removed(Misspelled)", "'foreverholdagrudge'let→Removed(Misspelled)", 'this→None', 'crunked→cranked', 'night→None', 'stop→None', 'me!→me']

Original: Yee skrrt ayy woo ooh ha brr
Processed: Yee skrrt ayy woo ooh ha brr
Corrections: ['Yee→None', 'skrrt→None', 'ayy→None', 'woo→None', 'ooh→None', 'ha→None', 'brr→None']

Original: This is a test oaky sentence with some स्क्रिप्ट विशेषता rumplestilskin chillin 6ty in my bimmer misspelled wrds and numbers 1234.
Processed: This is a test oaky sentence with some chilling in my simmer

In [128]:
valid_words = build_valid_words(new_df['Lyric'], min_freq=5)
print('done')

done


In [79]:
pip install tqdm




In [None]:
# from tqdm import tqdm
# tqdm.pandas()  # enable progress_apply with tqdm

# # Processing function that filters lyrics if misspelled words exist
# def process_if_misspelled(row):
#     if row['Misspelled_Words']:  # if list not empty
#         return preprocess_text(row['Lyric'], valid_words)[0]  # filtered text only
#     else:
#         return row['Lyric']  # no change


# new_df['Lyric_Filtered'] = new_df.progress_apply(process_if_misspelled, axis=1)
# new_df.to_pickle('processed_lyrics.pkl')  

# # On future runs, skip above lines and just load:
# # new_df = pd.read_pickle('processed_lyrics.pkl') 

100%|██████████| 5711/5711 [3:45:58<00:00,  2.37s/it]   


In [129]:
print(new_df.columns)


Index(['Artist', 'Title', 'Lyric', 'Misspelled_Words'], dtype='object')


In [130]:
new_df = pd.read_pickle('processed_lyrics.pkl') 

In [131]:
print(new_df.columns)


Index(['Artist', 'Title', 'Lyric', 'Misspelled_Words', 'Lyric_Filtered'], dtype='object')


### Export into new CSV file

In [132]:
new_df.to_csv(r"E:\Lyrics_Generator\Data_Sets\lyric_filtered.csv", index=False)

print("Filtered lyrics saved to 'filtered_songs.csv' successfully!")

Filtered lyrics saved to 'filtered_songs.csv' successfully!


### Loading the filtered DataSet

In [84]:
filtered_df = pd.read_csv(r"E:\Lyrics_Generator\Data_Sets\lyric_filtered.csv")

filtered_df.head(3)

Unnamed: 0,Artist,Title,Lyric,Misspelled_Words,Lyric_Filtered
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,"[""nothin'"", ""havin'"", 'ari', 'pre', ""'bout"", '...",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,"['redbottoms', ""tiffany's"", ""gleamin'"", ""looki...",yeah breakfast at tiffany and bottles of bubbl...
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...,"[""feelin'"", 'pre', 'uniwhen', 'onetrack', ""tel...",you you love it how i move you you love it how...


In [135]:
filtered_df.shape

(5709, 5)

In [136]:
filtered_df.isnull().sum()

Artist              0
Title               0
Lyric               0
Misspelled_Words    0
Lyric_Filtered      0
dtype: int64

In [137]:
# null_row = filtered_df[filtered_df['Lyric_Filtered'].isnull()]
# print(null_row)


In [138]:
## Drop the rows with null values in 'Lyric_Filtered'
filtered_df = filtered_df.dropna(subset=['Lyric_Filtered'])


In [139]:
filtered_df.isnull().sum()

Artist              0
Title               0
Lyric               0
Misspelled_Words    0
Lyric_Filtered      0
dtype: int64

In [140]:
filtered_df.shape

(5709, 5)

In [None]:
import re

# Function to detect words that are numbers or contain numbers
def has_numbers_in_words(text):
    if not isinstance(text, str):
        return False
    words = text.split()
    for word in words:
        if word.isdigit() or re.search(r'\d', word):
            return True
    return False

contains_numbers = filtered_df['Lyric_Filtered'].apply(has_numbers_in_words)

# Check how many rows have such words
print(f"Rows with numeric words: {contains_numbers.sum()}")

numeric_rows = filtered_df[contains_numbers]
print(numeric_rows[['Lyric_Filtered']])


Rows with numeric words: 41
                                         Lyric_Filtered
92    refrain i'll love you till i die boy every day...
116   jamie is over and jamie is gone jamie's decide...
242                                      released in 09
379   i tried to change closed my mouth more tried t...
445   i'm so in love i'm still in love i never met l...
515   it's what you do it's what you see i know if i...
547   i always loved you even though you did me wron...
707   i tried to change closed my mouth more tried t...
756   lights out you're not here holding me i count ...
808   i want to be alone alone with you does that ma...
842   i want to be alone alone with you does that ma...
849   i want to be alone alone with you does that ma...
1143  one two three will i ran away from you that's ...
1152  it could be worse i could be alone i could be ...
1184  have yourself a merry little christmas let you...
1238  tracy died soon after a long fought civil war ...
1287  00 yellow 00 c

In [158]:
def remove_number_words(text):
    if not isinstance(text, str):
        return text
    return ' '.join(word for word in text.split() if not re.search(r'\d', word))

# Remove number-containing words only in those rows
filtered_df.loc[contains_numbers, 'Lyric_Filtered'] = filtered_df.loc[contains_numbers, 'Lyric_Filtered'].apply(remove_number_words)

print("Removed words containing numbers from Lyric_Filtered.")

Removed words containing numbers from Lyric_Filtered.


In [159]:
contains_numbers = filtered_df['Lyric_Filtered'].apply(has_numbers_in_words)

# Check how many rows have such words
print(f"Rows with numeric words: {contains_numbers.sum()}")

Rows with numeric words: 0


In [160]:
misspelled_words = filtered_df['Lyric_Filtered'].apply(get_misspelled_words)
total_misspelled_words = misspelled_words.apply(len).sum()
print(f"Total misspelled words: {total_misspelled_words}")



Total misspelled words: 17009


In [161]:
print(misspelled_words)

0                   [nothin', havin', yee, fuckin', gon']
1       [beamin', lookin', settin', poppin', watchin',...
2                                      [tellin', feelin']
3       [talkin', nothin', feelin', comin', makin', wa...
4       [turnin', comin', pickin', lovin', livin', rai...
                              ...                        
5706                                                   []
5707                                                   []
5708                                                   []
5709                                       [gra, za, sie]
5710                                                   []
Name: Lyric_Filtered, Length: 5709, dtype: object


In [162]:
filtered_df['Filtered_Misspelled_Words'] = misspelled_words

In [163]:
filtered_df

Unnamed: 0,Artist,Title,Lyric,Misspelled_Words,Lyric_Filtered,Filtered_Misspelled_Words,Tokenized_Lyric,Joined_Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,"[""nothin'"", ""havin'"", 'ari', 'pre', ""'bout"", '...",thought i'd end up with sean but he wasn't a m...,"[nothin', havin', yee, fuckin', gon']","[thought, i'd, end, up, with, sean, but, he, w...",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,"['redbottoms', ""tiffany's"", ""gleamin'"", ""looki...",yeah breakfast at tiffany and bottles of bubbl...,"[beamin', lookin', settin', poppin', watchin',...","[yeah, breakfast, at, tiffany, and, bottles, o...",yeah breakfast at tiffany and bottles of bubbl...
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...,"[""feelin'"", 'pre', 'uniwhen', 'onetrack', ""tel...",you you love it how i move you you love it how...,"[tellin', feelin']","[you, you, love, it, how, i, move, you, you, l...",you you love it how i move you you love it how...
3,Ariana Grande,side to side,ariana grande nicki minaj i've been here all n...,"[""feelin'"", 'minime', 'minaj', ""'em"", ""makin'""...",grand nick mina i've been here all night i've ...,"[talkin', nothin', feelin', comin', makin', wa...","[grand, nick, mina, i've, been, here, all, nig...",grand nick mina i've been here all night i've ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i want to be ...,"[""turnin'"", ""comin'"", 'pre', ""pickin'"", ""lovin...",right now i'm in a state of mind i want to be ...,"[turnin', comin', pickin', lovin', livin', rai...","[right, now, i'm, in, a, state, of, mind, i, w...",right now i'm in a state of mind i want to be ...
...,...,...,...,...,...,...,...,...
5706,Taylor Swift,teardrops on my guitar (live from clear channe...,drew looks at me i fake a smile so he won't se...,"['pre', ""'cause"", 'whos']",drew looks at me i fake a smile so he won't se...,[],"[drew, looks, at, me, i, fake, a, smile, so, h...",drew looks at me i fake a smile so he won't se...
5707,Taylor Swift,evermore [forward],to put it plainly we just couldnt stop writing...,"['folklorian', 'couldnt', 'oneoff', 'imaginary...",to put it plainly we just stop writing songs t...,[],"[to, put, it, plainly, we, just, stop, writing...",to put it plainly we just stop writing songs t...
5708,Taylor Swift,welcome back grunwald,turn wycd on you're on your grunwald back from...,"['grunwald', ""grunwald's"", 'wycd']",turn wynd on you're on your back from the amaz...,[],"[turn, wynd, on, you're, on, your, back, from,...",turn wynd on you're on your back from the amaz...
5709,Taylor Swift,tolerate it (polskie tłumaczenie),zwrotka siedzę i patrzę jak czytasz z głową po...,"['jeśli', 'jak', 'czekam', 'cię', 'głową', 'ko...",i jam z i jail z eczema i co i zero nice type ...,"[gra, za, sie]","[i, jam, z, i, jail, z, eczema, i, co, i, zero...",i jam z i jail z eczema i co i zero nice type ...


## Tokenization

In [184]:
# Custom Tokenization
import re

def custom_tokenize(text):
    # pattern = r"\b\w+(?:'\w+)?'?|\S"
    # pattern = r"\b\w+(?:'\w+)?'?|[.,!?']"
    pattern = r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?'?|[.,!?']"


    return re.findall(pattern, text)

filtered_df['Tokenized_Lyric'] = filtered_df['Lyric_Filtered'].apply(custom_tokenize)


In [185]:
filtered_df['Tokenized_Lyric']

0       [thought, i'd, end, up, with, sean, but, he, w...
1       [yeah, breakfast, at, tiffany, and, bottles, o...
2       [you, you, love, it, how, i, move, you, you, l...
3       [grand, nick, mina, i've, been, here, all, nig...
4       [right, now, i'm, in, a, state, of, mind, i, w...
                              ...                        
5706    [drew, looks, at, me, i, fake, a, smile, so, h...
5707    [to, put, it, plainly, we, just, stop, writing...
5708    [turn, wynd, on, you're, on, your, back, from,...
5709    [i, jam, z, i, jail, z, eczema, i, co, i, zero...
5710    [trying, just, like, they, say, just, taking, ...
Name: Tokenized_Lyric, Length: 5709, dtype: object

In [186]:
# test
text = "hi, how're you?!@#$%^&*()_+=//"
print(custom_tokenize(text))

['hi', ',', "how're", 'you', '?', '!']


In [197]:
filtered_df

Unnamed: 0,Artist,Title,Lyric,Misspelled_Words,Lyric_Filtered,Filtered_Misspelled_Words,Tokenized_Lyric,Joined_Lyric
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,"[""nothin'"", ""havin'"", 'ari', 'pre', ""'bout"", '...",thought i'd end up with sean but he wasn't a m...,"[nothin', havin', yee, fuckin', gon']","[thought, i'd, end, up, with, sean, but, he, w...",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,"['redbottoms', ""tiffany's"", ""gleamin'"", ""looki...",yeah breakfast at tiffany and bottles of bubbl...,"[beamin', lookin', settin', poppin', watchin',...","[yeah, breakfast, at, tiffany, and, bottles, o...",yeah breakfast at tiffany and bottles of bubbl...
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...,"[""feelin'"", 'pre', 'uniwhen', 'onetrack', ""tel...",you you love it how i move you you love it how...,"[tellin', feelin']","[you, you, love, it, how, i, move, you, you, l...",you you love it how i move you you love it how...
3,Ariana Grande,side to side,ariana grande nicki minaj i've been here all n...,"[""feelin'"", 'minime', 'minaj', ""'em"", ""makin'""...",grand nick mina i've been here all night i've ...,"[talkin', nothin', feelin', comin', makin', wa...","[grand, nick, mina, i've, been, here, all, nig...",grand nick mina i've been here all night i've ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i want to be ...,"[""turnin'"", ""comin'"", 'pre', ""pickin'"", ""lovin...",right now i'm in a state of mind i want to be ...,"[turnin', comin', pickin', lovin', livin', rai...","[right, now, i'm, in, a, state, of, mind, i, w...",right now i'm in a state of mind i want to be ...
...,...,...,...,...,...,...,...,...
5706,Taylor Swift,teardrops on my guitar (live from clear channe...,drew looks at me i fake a smile so he won't se...,"['pre', ""'cause"", 'whos']",drew looks at me i fake a smile so he won't se...,[],"[drew, looks, at, me, i, fake, a, smile, so, h...",drew looks at me i fake a smile so he won't se...
5707,Taylor Swift,evermore [forward],to put it plainly we just couldnt stop writing...,"['folklorian', 'couldnt', 'oneoff', 'imaginary...",to put it plainly we just stop writing songs t...,[],"[to, put, it, plainly, we, just, stop, writing...",to put it plainly we just stop writing songs t...
5708,Taylor Swift,welcome back grunwald,turn wycd on you're on your grunwald back from...,"['grunwald', ""grunwald's"", 'wycd']",turn wynd on you're on your back from the amaz...,[],"[turn, wynd, on, you're, on, your, back, from,...",turn wynd on you're on your back from the amaz...
5709,Taylor Swift,tolerate it (polskie tłumaczenie),zwrotka siedzę i patrzę jak czytasz z głową po...,"['jeśli', 'jak', 'czekam', 'cię', 'głową', 'ko...",i jam z i jail z eczema i co i zero nice type ...,"[gra, za, sie]","[i, jam, z, i, jail, z, eczema, i, co, i, zero...",i jam z i jail z eczema i co i zero nice type ...


In [188]:
# joining tokenized lyrics for vectorization
filtered_df['Joined_Lyric'] = filtered_df['Tokenized_Lyric'].apply(lambda tokens: ' '.join(tokens))
filtered_df['Joined_Lyric']

0       thought i'd end up with sean but he wasn't a m...
1       yeah breakfast at tiffany and bottles of bubbl...
2       you you love it how i move you you love it how...
3       grand nick mina i've been here all night i've ...
4       right now i'm in a state of mind i want to be ...
                              ...                        
5706    drew looks at me i fake a smile so he won't se...
5707    to put it plainly we just stop writing songs t...
5708    turn wynd on you're on your back from the amaz...
5709    i jam z i jail z eczema i co i zero nice type ...
5710    trying just like they say just taking the step...
Name: Joined_Lyric, Length: 5709, dtype: object

In [189]:
misspelled_in_joined = filtered_df['Joined_Lyric'].apply(get_misspelled_words)
print(misspelled_in_joined)
print(misspelled_in_joined.apply(len).sum())

0                   [nothin', havin', yee, fuckin', gon']
1       [beamin', lookin', settin', poppin', watchin',...
2                                      [tellin', feelin']
3       [talkin', nothin', feelin', comin', makin', wa...
4       [turnin', comin', pickin', lovin', livin', rai...
                              ...                        
5706                                                   []
5707                                                   []
5708                                                   []
5709                                       [gra, za, sie]
5710                                                   []
Name: Joined_Lyric, Length: 5709, dtype: object
17009


In [194]:
# Unique misspelled words
unique_misspelled = set(misspelled_in_joined.explode().dropna().tolist())
print(f"Unique misspelled words ({len(unique_misspelled)}):")
print(unique_misspelled)

Unique misspelled words (2081):
{'oohohoohohoh', "failin'", "billin'", 'muid', "leavin'", 'penetrator', "stealin'", "gangbangin'", "dodgin'", 'sok', 'rel', "winkin'", 'scarface', "nuthin'", "checkin'", 'tay', "flowin'", "testin'", "deepthroatin'", "chuckin'", 'coldhearted', "pennin'", 'wap', 'stoneman', "chalkin'", 'sile', 'naw', 'takedown', "hissin'", "referencin'", "evolvin'", 'cack', "jottin'", "stinkin'", "cacklin'", "hangin'", "dartin'", "hookin'", 'unevadable', "passin'", "rebuildin'", 'miseducation', 'alt', "whisperin'", 'stylo', "wonderin'", "guessin'", 'nebby', "pannin'", "seekin'", "impressin'", 'giornata', 'ohoh', "smokin'", "layin'", 'flingy', "mixin'", "puffin'", 'killas', "yappin'", "undressin'", "smckin'", "emceein'", "interceptin'", "jerkin'", "vibin'", "believin'", "buzzin'", "dwarfin'", 'sil', "backhandin'", 'hakeem', "satanworshippin'", "toppin'", "paintin'", 'ohohohohohohohohohohohohohohohoh', "conkin'", "snitchin'", "burstin'", "wifin'", "escapin'", "healin'", "det

## Lemmatization

In [217]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

def lemmatize_tokens_with_pos(token_list):
    pos_tags = pos_tag(token_list)
    return [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags]


In [218]:
example_tokens = ['running', 'ran', 'eats', 'eating', 'cars', 'better', 'wolves', 'children']

lemmatized = lemmatize_tokens_with_pos(example_tokens)
print("Original:", example_tokens)
print("Lemmatized:", lemmatized)

Original: ['running', 'ran', 'eats', 'eating', 'cars', 'better', 'wolves', 'children']
Lemmatized: ['run', 'ran', 'eats', 'eat', 'car', 'well', 'wolf', 'child']


In [219]:
filtered_df['Lemmatized_Text'] = filtered_df['Lemmatized_Tokens'].apply(lambda tokens: ' '.join(tokens))

In [220]:
filtered_df

Unnamed: 0,Artist,Title,Lyric,Misspelled_Words,Lyric_Filtered,Filtered_Misspelled_Words,Tokenized_Lyric,Joined_Lyric,Lemmatized_Tokens,Lemmatized_Text
0,Ariana Grande,"​thank u, next",thought i'd end up with sean but he wasn't a m...,"[""nothin'"", ""havin'"", 'ari', 'pre', ""'bout"", '...",thought i'd end up with sean but he wasn't a m...,"[nothin', havin', yee, fuckin', gon']","[thought, i'd, end, up, with, sean, but, he, w...",thought i'd end up with sean but he wasn't a m...,"[thought, i'd, end, up, with, sean, but, he, w...",thought i'd end up with sean but he wasn't a m...
1,Ariana Grande,7 rings,yeah breakfast at tiffany's and bottles of bub...,"['redbottoms', ""tiffany's"", ""gleamin'"", ""looki...",yeah breakfast at tiffany and bottles of bubbl...,"[beamin', lookin', settin', poppin', watchin',...","[yeah, breakfast, at, tiffany, and, bottles, o...",yeah breakfast at tiffany and bottles of bubbl...,"[yeah, breakfast, at, tiffany, and, bottle, of...",yeah breakfast at tiffany and bottle of bubble...
2,Ariana Grande,​god is a woman,you you love it how i move you you love it how...,"[""feelin'"", 'pre', 'uniwhen', 'onetrack', ""tel...",you you love it how i move you you love it how...,"[tellin', feelin']","[you, you, love, it, how, i, move, you, you, l...",you you love it how i move you you love it how...,"[you, you, love, it, how, i, move, you, you, l...",you you love it how i move you you love it how...
3,Ariana Grande,side to side,ariana grande nicki minaj i've been here all n...,"[""feelin'"", 'minime', 'minaj', ""'em"", ""makin'""...",grand nick mina i've been here all night i've ...,"[talkin', nothin', feelin', comin', makin', wa...","[grand, nick, mina, i've, been, here, all, nig...",grand nick mina i've been here all night i've ...,"[grand, nick, mina, i've, been, here, all, nig...",grand nick mina i've been here all night i've ...
4,Ariana Grande,​​no tears left to cry,right now i'm in a state of mind i want to be ...,"[""turnin'"", ""comin'"", 'pre', ""pickin'"", ""lovin...",right now i'm in a state of mind i want to be ...,"[turnin', comin', pickin', lovin', livin', rai...","[right, now, i'm, in, a, state, of, mind, i, w...",right now i'm in a state of mind i want to be ...,"[right, now, i'm, in, a, state, of, mind, i, w...",right now i'm in a state of mind i want to be ...
...,...,...,...,...,...,...,...,...,...,...
5706,Taylor Swift,teardrops on my guitar (live from clear channe...,drew looks at me i fake a smile so he won't se...,"['pre', ""'cause"", 'whos']",drew looks at me i fake a smile so he won't se...,[],"[drew, looks, at, me, i, fake, a, smile, so, h...",drew looks at me i fake a smile so he won't se...,"[drew, look, at, me, i, fake, a, smile, so, he...",drew look at me i fake a smile so he won't see...
5707,Taylor Swift,evermore [forward],to put it plainly we just couldnt stop writing...,"['folklorian', 'couldnt', 'oneoff', 'imaginary...",to put it plainly we just stop writing songs t...,[],"[to, put, it, plainly, we, just, stop, writing...",to put it plainly we just stop writing songs t...,"[to, put, it, plainly, we, just, stop, writing...",to put it plainly we just stop writing song to...
5708,Taylor Swift,welcome back grunwald,turn wycd on you're on your grunwald back from...,"['grunwald', ""grunwald's"", 'wycd']",turn wynd on you're on your back from the amaz...,[],"[turn, wynd, on, you're, on, your, back, from,...",turn wynd on you're on your back from the amaz...,"[turn, wynd, on, you're, on, your, back, from,...",turn wynd on you're on your back from the amaz...
5709,Taylor Swift,tolerate it (polskie tłumaczenie),zwrotka siedzę i patrzę jak czytasz z głową po...,"['jeśli', 'jak', 'czekam', 'cię', 'głową', 'ko...",i jam z i jail z eczema i co i zero nice type ...,"[gra, za, sie]","[i, jam, z, i, jail, z, eczema, i, co, i, zero...",i jam z i jail z eczema i co i zero nice type ...,"[i, jam, z, i, jail, z, eczema, i, co, i, zero...",i jam z i jail z eczema i co i zero nice type ...


## Count Vectorization

In [221]:
from sklearn.feature_extraction.text import CountVectorizer
import re

# Define custom tokenizer that removes underscores
def custom_tokenizer(text):
    pattern = r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?'?|[.,!?']"
    tokens = re.findall(pattern, text)
    tokens = [token for token in tokens if not re.fullmatch(r'_+', token)]
    return tokens

# Use CountVectorizer with custom tokenizer
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, lowercase=True)
X = vectorizer.fit_transform(filtered_df['Lemmatized_Text'])




In [222]:
print(vectorizer.get_feature_names_out())
print(X.toarray())

["'" 'a' "a's" ... 'zooming' 'zucchini' 'zulu']
[[ 0  3  0 ...  0  0  0]
 [ 0  5  0 ...  0  0  0]
 [ 0 16  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  5  0 ...  0  0  0]
 [ 0  3  0 ...  0  0  0]]


In [223]:
df_CV = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df_CV

Unnamed: 0,',a,a's,aa,aah,aalii,aaliyah,aaliyah's,aaron,ab,...,zone,zoned,zonin',zoning,zoo,zoom,zoomed,zooming,zucchini,zulu
0,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5704,0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5705,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5706,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5707,0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [224]:
df_CV.columns.tolist()

["'",
 'a',
 "a's",
 'aa',
 'aah',
 'aalii',
 'aaliyah',
 "aaliyah's",
 'aaron',
 'ab',
 'aba',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abate',
 'abbey',
 'abbreviate',
 'abbreviation',
 'abdomen',
 'abdominal',
 'abducted',
 'abduction',
 'abdul',
 'abe',
 'abel',
 'aberration',
 "abidin'",
 'abigail',
 'ability',
 'ablaze',
 'able',
 'aboard',
 'abohm',
 'abolished',
 'abominable',
 'abort',
 'aborted',
 'abortion',
 'about',
 'above',
 'abracadabra',
 'abraham',
 'abrasive',
 'abridge',
 'abroad',
 'abrupt',
 'absence',
 'absent',
 'absentminded',
 'absentmindedly',
 'absinth',
 'absinthe',
 'absolute',
 'absolutely',
 'absorbed',
 'abstained',
 'abstaining',
 'abstract',
 'absurd',
 'absurdly',
 'abu',
 'abundance',
 'abundantly',
 'abuse',
 'abused',
 'abuser',
 "abusin'",
 'abusing',
 'abusive',
 'abut',
 'aby',
 'abyss',
 'ac',
 "ac's",
 'academy',
 'acarus',
 'accelerant',
 'accelerate',
 'accelerated',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'accepted

## Tf-Idf Vectorization

In [225]:
from sklearn.feature_extraction.text import TfidfVectorizer

filtered_df['Lemmatized_Text'] = filtered_df['Lemmatized_Tokens'].apply(lambda tokens: ' '.join(tokens))

tfidf_vectorizer = TfidfVectorizer()

X_tfidf = tfidf_vectorizer.fit_transform(filtered_df['Lemmatized_Text'])

print(X_tfidf.shape)  

print(tfidf_vectorizer.get_feature_names_out()[:10])

tfidf_array = X_tfidf.toarray()
print(tfidf_array[0]) 


(5709, 20890)
['aa' 'aah' 'aalii' 'aaliyah' 'aaron' 'ab' 'aba' 'aback' 'abandon'
 'abandoned']
[0. 0. 0. ... 0. 0. 0.]


In [230]:
df_Tf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
df_Tf

Unnamed: 0,aa,aah,aalii,aaliyah,aaron,ab,aba,aback,abandon,abandoned,...,zone,zoned,zonin,zoning,zoo,zoom,zoomed,zooming,zucchini,zulu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [231]:
df_Tf.columns.tolist()

['aa',
 'aah',
 'aalii',
 'aaliyah',
 'aaron',
 'ab',
 'aba',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abate',
 'abbey',
 'abbreviate',
 'abbreviation',
 'abdomen',
 'abdominal',
 'abducted',
 'abduction',
 'abdul',
 'abe',
 'abel',
 'aberration',
 'abidin',
 'abigail',
 'ability',
 'ablaze',
 'able',
 'aboard',
 'abohm',
 'abolished',
 'abominable',
 'abort',
 'aborted',
 'abortion',
 'about',
 'above',
 'abracadabra',
 'abraham',
 'abrasive',
 'abridge',
 'abroad',
 'abrupt',
 'absence',
 'absent',
 'absentminded',
 'absentmindedly',
 'absinth',
 'absinthe',
 'absolute',
 'absolutely',
 'absorbed',
 'abstained',
 'abstaining',
 'abstract',
 'absurd',
 'absurdly',
 'abu',
 'abundance',
 'abundantly',
 'abuse',
 'abused',
 'abuser',
 'abusin',
 'abusing',
 'abusive',
 'abut',
 'aby',
 'abyss',
 'ac',
 'academy',
 'acarus',
 'accelerant',
 'accelerate',
 'accelerated',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'acceptin',
 'accepting',
 'access',
 'ac

## Feature Engineering

In [None]:
##

In [None]:
# # Creating a mapping dictionary from artist to genre
# genre_map = {
#     'Eminem': 'Hip Hop / Rap',
#     'Taylor Swift': 'Pop / Country / Synthpop',
#     'Drake': 'Hip Hop / Rap / R&B',
#     'Beyoncé': 'R&B / Pop / Hip Hop',
#     'Rihanna': 'Pop / R&B / Dancehall',
#     'Lady Gaga': 'Pop / Dance / Electronic',
#     'Justin Bieber': 'Pop / R&B',
#     'Coldplay': 'Alternative Rock / Pop Rock',
#     'Katy Perry': 'Pop',
#     'Nicki Minaj': 'Hip Hop / Rap / Pop',
#     'Ariana Grande': 'Pop / R&B',
#     'Ed Sheeran': 'Pop / Folk Pop',
#     'BTS (방탄소년단)': 'K-pop / Pop / Hip Hop',
#     'Dua Lipa': 'Pop / Dance / Disco',
#     'Maroon 5': 'Pop Rock / Funk Pop',
#     'Selena Gomez': 'Pop / Dance Pop',
#     'Post Malone': 'Hip Hop / Pop / Trap',
#     'Billie Eilish': 'Alternative / Pop / Electropop',
#     'Charlie Puth': 'Pop / R&B',
#     'Cardi B': 'Hip Hop / Rap',
#     'Khalid': 'R&B / Pop / Soul'
# }

In [None]:
# Adding the new 'Genre' column based on the mapping

# df['Genre'] = df['Artist'].map(genre_map)

In [None]:
# df.head()

In [None]:
# df['Genre'].isnull().sum()

In [None]:
# df['Genre'].value_counts()

In [None]:
# df['Genre'].unique()