In [14]:

import pandas as pd
import matplotlib.pyplot as plt
import re
from langdetect import detect, LangDetectException

# Load the dataset
df = pd.read_csv("output/songs_with_lyrics.csv")
print(f"Total songs loaded: {len(df)}")


Total songs loaded: 614


In [15]:

# 1. Count Lines
def count_lines(text):
    if pd.isna(text): return 0
    return len(str(text).split('\n'))

df['line_count'] = df['lyrics'].apply(count_lines)

# 2. Count Characters
df['char_count'] = df['lyrics'].apply(lambda x: len(str(x)) if pd.notna(x) else 0)

# 3. Detect Tracklists
def is_tracklist(text):
    if pd.isna(text): return False
    matches = re.findall(r'^\d+\.', str(text), re.MULTILINE)
    return len(matches) > 3

df['is_tracklist'] = df['lyrics'].apply(is_tracklist)

# 4. Count Symbols
def count_symbols(text):
    if pd.isna(text): return 0
    return str(text).count('-') + str(text).count('>') + str(text).count(':')

df['symbol_count'] = df['lyrics'].apply(count_symbols)

# 5. Detect Annotated URLs
def is_annotated(url):
    return str(url).endswith('annotated')

df['is_annotated'] = df['url'].apply(is_annotated)

# 6. Detect Language
def detect_language(text):
    if pd.isna(text) or len(str(text).strip()) < 10: return 'unknown'
    try: return detect(str(text))
    except LangDetectException: return 'error'

print("Detecting languages...")
df['language'] = df['lyrics'].apply(detect_language)

# Show statistics
print("\nStatistics:")
print(df[['line_count', 'char_count', 'symbol_count']].describe())
print(f"\nTracklists detected: {df['is_tracklist'].sum()}")
print(f"Annotated URLs detected: {df['is_annotated'].sum()}")
print(f"Non-English songs detected: {len(df[df['language'] != 'en'])}")


Detecting languages...

Statistics:
        line_count     char_count  symbol_count
count   614.000000     614.000000    614.000000
mean    311.801303    8010.833876    159.255700
std     468.019331   21889.443694    326.262379
min       0.000000       0.000000      0.000000
25%      50.000000    1495.250000      2.250000
50%      86.500000    2709.500000     16.000000
75%     448.000000    7779.000000    259.500000
max    3853.000000  413767.000000   4592.000000

Tracklists detected: 44
Annotated URLs detected: 301
Non-English songs detected: 39


In [16]:

# Remove "Read More" header and everything before it
def lyrics_process(lyrics):
    if pd.isna(lyrics): return ""
    lyrics = str(lyrics)
    # Check for "Read More" (case-sensitive based on your example)
    # If found, keep everything AFTER it
    if "Read More" in lyrics:
        return lyrics.split("Read More", 1)[-1].strip()
    return lyrics.split("Lyrics", 1)[-1].strip()

# Apply to dataframe
print("Applying 'Read More' preprocessing...")
df['lyrics'] = df['lyrics'].apply(lyrics_process)
print("Preprocessing complete.")

# Test on a sample (optional, using the first row)
if len(df) > 0:
    print("Sample processed lyric start:")
    print(df['lyrics'].iloc[0][:100] + "...")


Applying 'Read More' preprocessing...
Preprocessing complete.
Sample processed lyric start:
[Intro]
Last night, I cried
[Verse]
Give me a second, give me a minute
Nah, lil' bitch, can't let yo...


In [17]:

# Define thresholds
MIN_LINES = 10
MAX_LINES = 150
MAX_CHARS = 5000
MAX_SYMBOLS = 10

# 1. Remove Duplicate URLs first (Aggressive removal: keep=False removes ALL duplicates)
print(f"Total before duplicate removal: {len(df)}")
df = df.drop_duplicates(subset=['url'], keep=False)
print(f"Total after duplicate removal: {len(df)}")

# 2. Apply Filters
cond_lines = (df['line_count'] >= MIN_LINES) & (df['line_count'] <= MAX_LINES)
cond_chars = (df['char_count'] <= MAX_CHARS)
# cond_symbols = (df['symbol_count'] <= MAX_SYMBOLS) # Disabled per request
cond_not_tracklist = (~df['is_tracklist']) # Restored (Numeric lists like "1. Song")
cond_not_annotated = (~df['is_annotated'])
cond_english = (df['language'] == 'en')

# Combine filters (Removed cond_symbols)
df_cleaned = df[cond_lines & cond_chars & cond_not_tracklist & cond_not_annotated & cond_english]

print(f"Songs before cleaning: {len(df)}")
print(f"Songs after cleaning: {len(df_cleaned)}")
print(f"Removed {len(df) - len(df_cleaned)} songs (filters)")

# Show removed samples
removed = df[~(cond_lines & cond_chars & cond_not_tracklist & cond_not_annotated & cond_english)]
if not removed.empty:
    print("\nSample of REMOVED entries (showing reason):")
    cols = ['title', 'char_count', 'is_tracklist', 'language']
    print(removed[cols].head(10))


Total before duplicate removal: 614
Total after duplicate removal: 428
Songs before cleaning: 428
Songs after cleaning: 257
Removed 171 songs (filters)

Sample of REMOVED entries (showing reason):
                                   title  char_count  is_tracklist language
2                Christmas Album Cleanup        3557         False       en
7                     IC’s Listening Log       61291          True       en
10                INTRO (Short n’ Sweet)         210         False       en
13                  spotify wrapped 2025         658          True       en
17          Every Anthony Fantano Review       55798         False       en
23          Faouziafan01's CD collection         348         False       en
31  November 2023 Album Release Calendar       11746         False       en
33                          Cosa Nuestra        2167         False       es
34                         2024 Nominees        5799         False       en
35               Albums I’ve Listened To   

In [18]:

df_cleaned.to_csv("output/songs_with_lyrics_cleaned.csv", index=False)
print("Saved cleaned data to output/songs_with_lyrics_cleaned.csv")


Saved cleaned data to output/songs_with_lyrics_cleaned.csv
