In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Locations of the three input CSV files used by the notebook.
# NOTE(review): these are absolute paths on a local E: drive; they must be
# adjusted before the notebook can run on another machine.
filename_tracks = 'E:/Downloads/archive-features/tracks.csv'  # read into `df`
filename_lyr = "E:/Downloads/archive-lyrics/lyrics-data.csv"  # read into `df_lyr`
filename_gen = "E:/Downloads/archive-lyrics/artists-data.csv"  # read into `df_gen`

In [None]:
# Function to clean and format artist names consistently
def clean_artist_name(artist_name):
    """Normalise an artist name so names from different datasets can be compared.

    Every character that is neither a word character nor whitespace is
    removed, runs of whitespace are collapsed to a single space, the ends are
    trimmed and the result is lower-cased.

    Any exception raised while normalising (for example a ``TypeError`` for a
    non-string value) is re-raised after the offending value has been printed.
    """
    try:
        without_punctuation = re.sub(r'[^\w\s]', '', artist_name)
        single_spaced = re.sub(r'\s+', ' ', without_punctuation)
        normalised = single_spaced.strip().lower()
    except Exception:
        # Show which value broke the clean-up before propagating the error.
        print(f"Error cleaning artist name: {artist_name}")
        raise
    return normalised

# Cleanup genre column
def split_genres(genre_string):
    """Split a raw genre string into a list of genre names.

    The string is split on any of the delimiters ``,``, ``;`` or ``/``. Each
    piece is stripped of surrounding whitespace and empty pieces (caused by
    trailing or doubled delimiters, or an empty input) are discarded.

    Parameters
    ----------
    genre_string : Any
        Raw genre text such as ``"Rock; Pop/Hip Hop"``. Non-string values
        (e.g. NaN or None) are treated as "no genres".

    Returns
    -------
    list of str
        The individual genre names, in their original order.
    """
    if not isinstance(genre_string, str):
        return []

    # One regex split handles every delimiter uniformly. The previous
    # replace-then-split approach left leading spaces on tokens whenever a
    # delimiter was already followed by a space, and kept empty tokens.
    pieces = (piece.strip() for piece in re.split(r'[,;/]', genre_string))
    return [piece for piece in pieces if piece]

# Pre-process lyrics
def preprocess_text(text, stop_filter=True, flg_stemm=False, flg_lemm=True):
    """Normalise raw lyric text into a space-separated string of tokens.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, and split on whitespace. The tokens are
    then optionally filtered and reduced before being joined back together.

    Parameters
    ----------
    text : Any
        The raw text. ``None`` and float NaN are treated as missing and yield
        an empty string; any other value is converted with ``str()``.
    stop_filter : bool, default True
        Drop English stop words (NLTK ``stopwords`` corpus).
    flg_stemm : bool, default False
        Apply NLTK's Porter stemmer to every token.
    flg_lemm : bool, default True
        Apply NLTK's WordNet lemmatizer to every token.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Missing lyrics reach this function as None / NaN. Passing them through
    # str() would produce the bogus tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    ## clean (convert to lowercase, remove punctuation, then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    words = text.split()

    ## remove Stopwords
    if stop_filter:
        stop_word_set = _english_stop_words()
        words = [word for word in words if word not in stop_word_set]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        words = [ps.stem(word) for word in words]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        words = [lem.lemmatize(word) for word in words]

    ## back to string from list
    return " ".join(words)


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once.

    The words get the same punctuation stripping as the lyric text, so that
    contractions such as "don't" (which appear as "dont" in the cleaned
    tokens) are actually filtered out.
    """
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(
            re.sub(r'[^\w\s]', '', word) for word in stopwords.words("english")
        )
        _english_stop_words._cache = cached
    return cached

In [None]:
# Load the song dataset (musical features) from the tracks CSV
df = pd.read_csv(filepath_or_buffer=filename_tracks)

df.head(n=5)

In [None]:
# Load the lyrics dataset, leaving out the one problematic file line (index 177701)
df_lyr = pd.read_csv(filename_lyr, skiprows=[177701])

df_lyr.head(n=5)

In [None]:
# Turn the 'ALink' values of df_lyr into plain artist names: trim the '/'
# characters, replace hyphens with spaces and lower-case. The column is already
# consistently formatted, so clean_artist_name is not needed here.
df_lyr['ALink'] = (
    df_lyr['ALink']
    .str.strip('/')
    .str.replace('-', ' ')
    .str.lower()
)

# Normalise the artist names of the track table with clean_artist_name
df['artists'] = df['artists'].apply(clean_artist_name)

# Inner-join tracks and lyrics on (song name, artist name), then discard the
# lyric-side key columns ('SName' and 'ALink'), which are no longer needed
df_comb = pd.merge(
    df,
    df_lyr,
    how='inner',
    left_on=['name', 'artists'],
    right_on=['SName', 'ALink'],
).drop(columns=['SName', 'ALink'])

df_comb.head()

In [None]:
# Load the genre dataset from the artists CSV
df_gen = pd.read_csv(filepath_or_buffer=filename_gen)

df_gen.head(n=5)

In [None]:
# Drop rows without an artist, discard the columns that are not needed, and
# normalise the artist names with clean_artist_name so they can be compared
# against the artist names of the combined song table
df_gen = (
    df_gen
    .dropna(subset=['Artist'])
    .drop(columns=['Popularity', 'Link', 'Songs'])
    .assign(Artist=lambda frame: frame['Artist'].apply(clean_artist_name))
)

In [None]:
# Attach the genre string of each song's artist from df_gen to df_comb.
# The artist -> genres lookup is built once (the first df_gen row per artist
# wins, matching a first-match search) and applied with a single vectorised
# map instead of rescanning df_gen for every song. Songs whose artist is not
# found get NaN, and the 'Genres' column exists even when nothing matches.
genre_by_artist = (
    df_gen
    .drop_duplicates(subset='Artist', keep='first')
    .set_index('Artist')['Genres']
)
df_comb['Genres'] = df_comb['artists'].map(genre_by_artist)

# Confirm all desired columns are present
print(df_comb.columns)

# Size check
print(len(df_comb))

In [None]:
# Drop, in place, every song whose 'Genres' value is missing (NaN)
df_comb.dropna(subset=['Genres'], inplace=True)

# Size check: number of songs left after removing those without genres
print(len(df_comb))

In [None]:
# Convert each raw genre string into a list of genre names (via split_genres),
# with surrounding whitespace trimmed from every name
df_comb['Genres'] = df_comb['Genres'].apply(
    lambda raw_genres: [name.strip() for name in split_genres(raw_genres)]
)

In [None]:
# Remove duplicate songs (same song name and artist name), keeping the entry
# with the highest popularity: sort by 'popularity' descending first so that
# keep='first' retains the most popular row of each duplicate group.
df_comb.sort_values('popularity', ascending=False, inplace=True)
df_comb.drop_duplicates(subset=['name', 'artists'], keep='first', inplace=True)

# Size check: number of songs left after de-duplication
print(len(df_comb))

In [None]:
# Store the pre-processed version of every lyric (preprocess_text with its
# default options) in a new 'clean lyric' column
df_comb['clean lyric'] = df_comb['Lyric'].apply(preprocess_text)

Quick sanity check on dataset

In [None]:
# List every distinct genre that occurs in any song's genre list
genre_list = (
    df_comb['Genres']
    .explode()
    .unique()
    .tolist()
)
print(genre_list)

In [None]:
# Keep only the songs whose (cleaned) artist name is exactly 'louis armstrong'
filtered_df = df_comb.loc[df_comb['artists'].eq('louis armstrong')]

# Display the selection to check by eye that duplicates are gone and that the
# genres and cleaned lyrics look right
filtered_df

Save finalized dataframe as csv

In [None]:
# Write the finalized dataframe to CSV without the index column
df_comb.to_csv(path_or_buf='df_comb_v3.csv', index=False)