# Text Classification

_Project for week 4, by Jan Kühn, April 2023_


## Project task and outline

In this project, we will build a text classification model on song lyrics. The task is to predict the artist from a piece of text. To train such a model, you first need to collect your own lyrics dataset:

- Download an HTML page with links to songs
- Extract hyperlinks of song pages
- Download and extract the song lyrics
- Vectorize the text using the Bag Of Words method
- Train a classification model that predicts the artist from a piece of text
- Refactor the code into functions
- Write a simple command-line interface for the program
- Upload your code to GitHub


In [None]:
import os
import re
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from PIL import Image, ImageDraw, ImageFont
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from wordcloud import STOPWORDS, WordCloud

In [None]:
# Directory prefix under which all scraped HTML files are stored
SCRAPE_PATH = "scrape/"
# Switches for the expensive pipeline stages; each one is checked before the
# corresponding step runs
SCRAPE_SONG_LIST = False  # download the per-artist song list pages
SCRAPE_SONGS = False  # download the individual song pages
PARSE_HTML = False  # parse the stored HTML files into the songs CSV
CREATE_WORDCLOUDS = False  # render the word cloud images
# Pause in seconds after each HTTP request
SLEEP_SEC = 3
# Values handed to requests.get() by the scraping functions
HEADER = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0"
}
# Artist name -> URL of the page listing that artist's songs
artist_urls = {
    "Eels": "https://www.lyrics.com/artist.php?name=Eels&aid=182509&o=1",
    "Rage Against the Machine": "https://www.lyrics.com/artist.php?name=Rage-Against-the-Machine&aid=23206&o=1",
    "Adele": "https://www.lyrics.com/artist.php?name=Adele&aid=861756&o=1",
}

### Define functions


#### Scrape & Parse

In [None]:
def shorten_artist(artist: str) -> str:
    """
    Function to build a short, lowercase identifier for an artist.

    A name containing at least one space is reduced to the first character
    of each word ("Rage Against the Machine" -> "ratm"); any other name is
    returned unchanged apart from being lowercased ("Eels" -> "eels").
    """
    # Names without a space are kept whole
    if " " not in artist:
        return artist.lower()

    # \b\w matches the first word character after every word boundary
    initials = re.findall(r'\b\w', artist)
    return "".join(initials).lower()

In [None]:
def scrape_artist_song_list(artist_urls: dict[str, str]) -> None:
    """
    Function to scrape song list from a website and save them as files.

    For every artist the song list page is downloaded and stored as
    "<short artist name>_full_song_list.html" inside SCRAPE_PATH. Files that
    already exist are skipped, so the function can be re-run without fetching
    anything twice.

    Parameters:
        artist_urls: Mapping of artist name to the URL of the song list page.
    """
    # Create directory for scraped files if it doesn't exist
    os.makedirs(SCRAPE_PATH, exist_ok=True)

    for artist, url in artist_urls.items():
        file_name = f"{shorten_artist(artist)}_full_song_list.html"
        file_path = os.path.join(SCRAPE_PATH, file_name)

        # Do nothing if file exists already
        if os.path.isfile(file_path):
            print(f"Skipped existing file {file_name}.")
            continue

        # The headers have to be passed by keyword: the second positional
        # argument of requests.get() is `params`, which would append the
        # User-Agent to the query string instead of sending it as a header.
        response = requests.get(url, headers=HEADER, allow_redirects=False)

        if response.status_code == 200:
            with open(file_path, "w") as f:
                f.write(response.text)

            print(f"Song list for {artist} written to file {SCRAPE_PATH}{file_name}")

        else:
            print(f"Error: Response code {response.status_code} for URL {url}.")

        # Pause between requests to go easy on the server
        time.sleep(SLEEP_SEC)

In [None]:
def remove_duplicate_urls(song_urls: dict[str, list]) -> dict[str, list]:
    """
    Function to remove duplicate URLs.

    Two URLs of the same artist count as duplicates when their last two path
    segments are equal. The first occurrence is kept and the original order
    is preserved.

    Parameters:
        song_urls: Mapping of artist name to a list of song URLs.

    Returns:
        A new mapping with the same keys and de-duplicated URL lists.
    """
    song_urls_clean = {}

    for artist, urls in song_urls.items():
        urls_clean = []
        # Tails seen so far. Exact matching avoids dropping a song whose tail
        # merely is a substring of another URL (".../Love" vs ".../Love+Song").
        seen_tails = set()

        for url in urls:
            end_of_url = "/".join(url.rsplit("/", 2)[1:3])

            if end_of_url in seen_tails:
                continue

            seen_tails.add(end_of_url)
            urls_clean.append(url)

        count_append = len(urls_clean)
        count_remove = len(urls) - count_append

        print(
            f"{count_remove} duplicates removed, leaving {count_append} URLs for {artist}."
        )

        song_urls_clean[artist] = urls_clean

    return song_urls_clean

In [None]:
def get_song_urls(artist_urls: dict[str, str]) -> dict[str, list]:
    """
    Function to get song URLs from HTML files.

    Reads the stored song list page of every artist from SCRAPE_PATH, collects
    the link found in the first cell of each row of the "tdata" table and
    removes duplicates. If SCRAPE_SONG_LIST is set, the song list pages are
    scraped first.

    Parameters:
        artist_urls: Mapping of artist name to the URL of the song list page.

    Returns:
        Mapping of artist name to a list of absolute song URLs.
    """

    # Scrape HTML files containing URLs to song lyrics
    if SCRAPE_SONG_LIST:
        scrape_artist_song_list(artist_urls)

    song_urls = {}

    for artist in artist_urls:
        file_name = f"{shorten_artist(artist)}_full_song_list.html"

        with open(SCRAPE_PATH + file_name, "r") as f:
            html = f.read()

        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", class_="tdata")

        parsed_urls = []

        if table is None:
            # Report an unexpected page layout instead of crashing
            print(f"Error: No song table found in {file_name}.")
        else:
            for row in table.find_all("tr"):
                # Rows without a data cell or without a link are skipped
                cell = row.find("td")
                link = cell.find("a", href=True) if cell is not None else None
                if link is None:
                    continue

                # The hrefs are site-relative, so prepend the host
                parsed_urls.append("https://www.lyrics.com" + link["href"])

        song_urls[artist] = parsed_urls
        print(f"Added {len(parsed_urls)} URLs for artist {artist}.")

    # Remove duplicate URLS
    return remove_duplicate_urls(song_urls)

In [None]:
def scrape_songs_to_files(song_urls: dict[str, list]) -> None:
    """
    Function to scrape songs and save them locally.

    Every song page is stored as "<short artist name>-<last URL segment>.html"
    in a per-artist sub-directory of SCRAPE_PATH. Existing files are skipped.

    Parameters:
        song_urls: Mapping of artist name to a list of song URLs.
    """
    for artist, urls in song_urls.items():
        short_name = shorten_artist(artist)
        path = SCRAPE_PATH + short_name + "/"
        count_skipped = 0

        # Create directory for scraped files if it doesn't exist
        os.makedirs(path, exist_ok=True)

        for url in urls:
            file_name = f"{short_name}-{url.split('/')[-1]}.html"
            file_path = os.path.join(path, file_name)

            # Do nothing if file exists already
            if os.path.isfile(file_path):
                count_skipped += 1
                continue

            # The headers have to be passed by keyword: the second positional
            # argument of requests.get() is `params`, which would append the
            # User-Agent to the query string instead of sending it as a header.
            response = requests.get(url, headers=HEADER, allow_redirects=False)

            if response.status_code == 200:
                with open(file_path, "w") as f:
                    f.write(response.text)

                print(f"File {path + file_name} for {artist} written to file.")

            else:
                print(f"Error: Response code {response.status_code} for URL {url}.")

            # Pause between requests to go easy on the server
            time.sleep(SLEEP_SEC)

        print(f"Skipped {count_skipped} existing files for artist {artist}.")

In [None]:
def parse_html(html: str, source: str = "") -> tuple[str, str, str]:
    """
    Function to parse HTML and extract title, artist and lyrics.

    Each field is looked up independently. A field that cannot be extracted
    is reported with a message naming `source` and returned as an empty
    string, so one missing element does not affect the other two.

    Parameters:
        html: Markup of a song page.
        source: Where the markup came from; only used in error messages.

    Returns:
        Tuple of (title, artist, lyrics), each stripped of surrounding
        whitespace.
    """
    soup = BeautifulSoup(html, "html.parser")

    def text_or_blank(lookup, error_message: str) -> str:
        # Run one lookup; any failure is reported and yields ""
        try:
            return lookup()
        except Exception:
            print(error_message)
            return ""

    # Extract title, artist, and lyrics
    title = text_or_blank(
        lambda: soup.h1.text.strip(),
        f"Error parsing title at {source}.",
    )
    artist = text_or_blank(
        lambda: soup.find("h3", class_="lyric-artist").text.strip(),
        f"Error parsing artist at {source}.",
    )
    lyrics = text_or_blank(
        lambda: soup.find(id="lyric-body-text").text.strip(),
        f"Error parsing lyrics at {source}.",
    )

    return title, artist, lyrics

In [None]:
def get_lyrics_from_file(path_html: str) -> tuple[str, str, str]:
    """
    Function to read one stored song page and extract its data.

    Parameters:
        path_html: Path of the HTML file to read.

    Returns:
        Tuple of (title, artist, lyrics) as produced by parse_html().
    """
    with open(path_html, "r") as html_file:
        markup = html_file.read()

    # The path doubles as the source label used in parse_html's error messages
    return parse_html(markup, path_html)

In [None]:
def get_lyrics_from_url(url: str) -> tuple[str, str, str]:
    """
    Function to scrape one single song lyric from URL.

    Parameters:
        url: Address of the song page.

    Returns:
        Tuple of (title, artist, lyrics). All three are empty strings when the
        server does not answer with status code 200.
    """
    # The headers have to be passed by keyword: the second positional argument
    # of requests.get() is `params`, which would append the User-Agent to the
    # query string instead of sending it as a header.
    response = requests.get(url, headers=HEADER, allow_redirects=False)

    if response.status_code != 200:
        print(f"Error: Response code {response.status_code} for URL {url}.")
        return "", "", ""

    return parse_html(response.text, url)

In [None]:
def get_files_to_parse(artists: list[str]) -> dict[str, list[str]]:
    """
    Function to get file names in the scrape directory.

    Parameters:
        artists: Artist names; each one is expected to have a sub-directory
            named after its short form inside SCRAPE_PATH.

    Returns:
        Mapping of artist name to the sorted names (not paths) of the HTML
        files in that artist's directory.

    Raises:
        FileNotFoundError: If an artist directory does not exist.
    """

    all_files = {}

    for artist in artists:
        # Directory holding HTML files
        path_html_files = os.path.join(SCRAPE_PATH, shorten_artist(artist))

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # resulting dataset order reproducible between runs and machines.
        all_files[artist] = sorted(
            f
            for f in os.listdir(path_html_files)
            if f.endswith(".html") and os.path.isfile(os.path.join(path_html_files, f))
        )

    return all_files

In [None]:
def parse_lyrics_from_files(song_urls: dict[str, list[str]]) -> pd.DataFrame:
    """
    Function to parse lyrics from existing files.

    Parses every stored song page of the given artists, cleans the resulting
    table with clean_data() and writes it to "data/songs_clean.csv". If
    SCRAPE_SONGS is set, the song pages are scraped first.

    Parameters:
        song_urls: Mapping of artist name to a list of song URLs. The URLs are
            only used for scraping; parsing works on the files found on disk.

    Returns:
        The cleaned DataFrame with the columns "title", "artist" and "lyrics".
    """

    # Scrape lyrics from URLS
    if SCRAPE_SONGS:
        scrape_songs_to_files(song_urls)

    # Get file names
    files_to_parse = get_files_to_parse(list(song_urls.keys()))

    # Collect all rows first and build the DataFrame once; growing a DataFrame
    # row by row copies the data on every insert.
    rows = []
    for artist, files in files_to_parse.items():
        for f in files:
            path_html_file = SCRAPE_PATH + shorten_artist(artist) + "/" + f
            rows.append(get_lyrics_from_file(path_html_file))

    songs = pd.DataFrame(rows, columns=["title", "artist", "lyrics"])

    # Clean data
    songs_clean = clean_data(songs)

    # Save DataFrame to CSV (create the target directory on first use)
    file_name_csv_clean = "data/songs_clean.csv"
    os.makedirs(os.path.dirname(file_name_csv_clean), exist_ok=True)
    songs_clean.to_csv(file_name_csv_clean)

    print(f"Saved {len(songs_clean)} songs to {file_name_csv_clean}")

    return songs_clean

#### Dataframes

In [None]:
def clean_data(df_: pd.DataFrame, artists: "list[str] | None" = None) -> pd.DataFrame:
    """
    Function to clean the data.

    Keeps only rows that
    - have non-empty lyrics,
    - are credited to exactly one of the wanted artists (case-insensitive),
    - have a title that does not end with "]".

    Parameters:
        df_: DataFrame with the columns "title", "artist" and "lyrics".
        artists: Artist names to keep. Defaults to the keys of the
            module-level `artist_urls`.

    Returns:
        The filtered DataFrame; the original index labels are kept.
    """
    if artists is None:
        artists = artist_urls
    wanted_artists = [a.lower() for a in artists]

    # Remove all rows where lyrics cell is empty
    df_ = df_[df_["lyrics"].notna()]
    df_ = df_[df_["lyrics"] != ""]

    # Remove all songs that are not exactly by artists specified.
    # isin() always yields a boolean mask, even for an empty artist list.
    df_ = df_[df_["artist"].str.lower().isin(wanted_artists)]

    # Remove all rows where title ends with ]
    df_ = df_[df_["title"].str[-1] != "]"]

    return df_

In [None]:
def convert_lyrics_to_lines(df_: pd.DataFrame) -> pd.DataFrame:
    """
    Function to split the lyrics by line.

    Every song is expanded into one row per lyric line; title and artist are
    repeated on each row. Empty lines are dropped. The result is also written
    to "data/songs_by_line.csv".

    Parameters:
        df_: DataFrame with the columns "title", "artist" and "lyrics".

    Returns:
        DataFrame with the columns "title", "artist" and "lyrics", where
        "lyrics" holds a single line.
    """
    lines = (
        # Title and artist travel along as index while the lyrics are split
        df_.set_index(["title", "artist"])["lyrics"]
        # Split and explode the lyrics by newline
        .str.split("\n")
        .explode()
        # Turn title and artist back into regular columns
        .reset_index()
    )

    # Remove rows without lyrics in them
    lines = lines[lines["lyrics"].notna() & (lines["lyrics"] != "")]

    # Create the target directory on first use
    file_name_csv = "data/songs_by_line.csv"
    os.makedirs(os.path.dirname(file_name_csv), exist_ok=True)
    lines.to_csv(file_name_csv)

    return lines

#### Wordcloud

In [None]:
def wordcloud_create_img(text: str, width: int = 2000, height: int = 1500):
    """
    Function to create a white image with the given text centered in black.

    plot_wordcloud() converts the image to an array and uses it as the mask
    of a text-shaped word cloud.

    Parameters:
        text: Text to draw.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        The PIL image, or None if the font file "Boldova.ttf" is not found in
        the working directory.
    """
    # Define font to be used (downloaded from https://www.cufonfonts.com/font/boldova)
    font_file = "Boldova.ttf"

    if not os.path.isfile(font_file):
        print("Error: Font file not found.")
        return None

    font = ImageFont.truetype(font_file, size=600)

    # Create image
    img = Image.new("RGB", (width, height), color="white")
    img_draw = ImageDraw.Draw(img)

    # Measure the text. ImageDraw.textsize() was removed in Pillow 10, so
    # textbbox() is preferred and textsize() only serves older versions.
    if hasattr(img_draw, "textbbox"):
        left, top, right, bottom = img_draw.textbbox((0, 0), text, font=font)
    else:
        left, top = 0, 0
        right, bottom = img_draw.textsize(text, font=font)
    text_width = right - left
    text_height = bottom - top

    # Calculate coordinates to center the text; subtracting left/top
    # compensates for the offset between drawing origin and bounding box
    x_text = int((width - text_width) / 2) - left
    y_text = int((height - text_height) / 2) - top

    # Add text
    img_draw.text((x_text, y_text), text, font=font, fill=(0, 0, 0))

    return img

In [None]:
def plot_wordcloud(corpus: str, name: str, shape: str = "rect") -> None:
    """
    Function to render a word cloud and save it as a PNG file.

    The image is written to "wordclouds/wordcloud-<name>-<shape>.png".

    Parameters:
        corpus: Text the word cloud is generated from.
        name: Label used in the file name; for shape "text" it is also the
            text that forms the outline of the cloud.
        shape: "circle" for a round cloud, "text" for a cloud in the shape of
            `name`; any other value produces a plain rectangle.
    """
    # Imported under its own name because the module-level name STOPWORDS is
    # rebound to the NLTK stop word list further down in this file.
    from wordcloud import STOPWORDS as wordcloud_stopwords

    # Some settings
    width = 2000
    height = 1000

    # Create shapes
    if shape == "circle":
        # From https://www.python-lernen.de/wordcloud-erstellen-python.htm
        # Square mask of side `height`; everything outside the circle is 255
        center = height // 2
        radius = int(0.4 * height)
        x, y = np.ogrid[:height, :height]
        mask = (x - center) ** 2 + (y - center) ** 2 > radius**2
        mask = 255 * mask.astype(int)

        # Change width to get a square
        width = height
    elif shape == "text":
        # Change width to get a wide rectangle
        height = int(width / 2)

        # Create image with text
        wordcloud_img = wordcloud_create_img(name, width=width, height=height)
        if wordcloud_img is None:
            return None
        mask = np.array(wordcloud_img)
    else:
        mask = None

    # Generate word cloud
    wordcloud = WordCloud(
        width=width,
        height=height,
        random_state=1,
        background_color="white",
        # colormap="Pastel1",
        collocations=False,
        stopwords=wordcloud_stopwords,
        mask=mask,
        contour_color="#ccc",
        contour_width=2,
    ).generate(corpus)

    # Set figure size
    plt.figure(figsize=(40, 30))
    # Display image
    plt.imshow(wordcloud)
    # No axis details
    plt.axis("off")
    # Save as file (create the target directory on first use)
    os.makedirs("wordclouds", exist_ok=True)
    plt.savefig(f"wordclouds/wordcloud-{name}-{shape}.png", dpi=72, bbox_inches="tight")

### Run the code


In [None]:
# Rebuild the cleaned song table from the scraped HTML files.
# Skipped unless PARSE_HTML is set; the CSV written by an earlier run is then
# loaded further down instead.
if PARSE_HTML:
    # Get song URLs
    song_urls = get_song_urls(artist_urls)

    # Parse lyrics from file and save them in a CSV file
    songs = parse_lyrics_from_files(song_urls)

In [None]:
# Load the cleaned songs from the CSV written by parse_lyrics_from_files();
# the first CSV column holds the saved DataFrame index
songs = pd.read_csv("data/songs_clean.csv", index_col=0)
songs

In [None]:
# One row per lyric line; this is the unit the classifier is trained on
df_corpus = convert_lyrics_to_lines(songs)
# Share of lines per artist, to see how balanced the classes are
df_corpus["artist"].value_counts(normalize=True)

## Wordcloud


#### Eels


In [None]:
# Circular word cloud built from all Eels lines joined into one string
if CREATE_WORDCLOUDS:
    corpus = " ".join(df_corpus[df_corpus["artist"] == "Eels"]["lyrics"])
    plot_wordcloud(corpus, name="Eels", shape="circle")

#### Rage Against the Machine


In [None]:
# Word cloud in the shape of the letters "ratm"; for shape="text" the `name`
# argument is also the text that forms the outline
if CREATE_WORDCLOUDS:
    corpus = " ".join(
        df_corpus[df_corpus["artist"] == "Rage Against the Machine"]["lyrics"]
    )
    plot_wordcloud(corpus, name="ratm", shape="text")

#### Adele


In [None]:
# Circular word cloud built from all Adele lines joined into one string
if CREATE_WORDCLOUDS:
    corpus = " ".join(
        df_corpus[df_corpus["artist"] == "Adele"]["lyrics"]
    )
    plot_wordcloud(corpus, name="Adele", shape="circle")

## Model

In [None]:
# Download the NLTK data once: uncomment the two lines below on the first run
# and adjust download_dir to your own environment
#nltk.download("wordnet", download_dir="/home/jan/.miniconda3/envs/jupyter/nltk_data")
#nltk.download('stopwords', download_dir="/home/jan/.miniconda3/envs/jupyter/nltk_data")

### Define some functions

In [None]:
def prepare_corpus(df_c: pd.DataFrame) -> tuple[list[str], list[str]]:
    """
    Function to prepare the corpus from a dataframe.

    The lines are grouped by artist, in the order in which the artists first
    appear in the DataFrame.

    Parameters:
        df_c: DataFrame with the columns "artist" and "lyrics", one lyric line
            per row.

    Returns:
        Tuple of (corpus, labels): the lyric lines and, at the same position,
        the artist each line belongs to.
    """

    corpus = []
    labels = []

    # Create list of song lines and labels
    for artist in df_c["artist"].unique():
        song_lines = df_c.loc[df_c["artist"] == artist, "lyrics"]

        corpus.extend(song_lines)
        # One label per line keeps both lists aligned
        labels.extend([artist] * len(song_lines))

    return corpus, labels

In [None]:
def preprocess_corpus(corpus_: list[str]) -> list[str]:
    """
    Function to normalise lyric lines before they are fed to the model.

    Every line is lowercased and stripped, tokenized with NLTK's
    TreebankWordTokenizer, lemmatized token by token with WordNetLemmatizer
    and joined back into a single space-separated string.

    Parameters:
        corpus_: The raw lines.

    Returns:
        The preprocessed lines, in the same order as the input.
    """
    # Convert to lowercase
    lowered = [line.lower().strip() for line in corpus_]

    # Tokenize and lemmatize
    tokenize = TreebankWordTokenizer().tokenize
    lemmatize = WordNetLemmatizer().lemmatize

    return [" ".join(map(lemmatize, tokenize(text=line))) for line in lowered]

In [None]:
def print_results(lyrics: list[str], predictions: list[str], probabilities: list[float]) -> None:
    """
    Function to print one verdict per classified line.

    The probability selects how confident the wording is: below 0.6
    "I guess", below 0.75 "I believe", below 0.9 "I am pretty sure",
    otherwise "I am positive".

    Parameters:
        lyrics: The lines that were classified.
        predictions: Predicted artist for each line.
        probabilities: Probability of the predicted artist for each line.
    """
    # (exclusive upper bound, wording), ordered from least to most confident
    tiers = (
        (0.6, "I guess"),
        (0.75, "I believe"),
        (0.9, "I am pretty sure"),
    )

    def wording_for(probability: float) -> str:
        # The first tier whose bound lies above the probability wins
        for upper_bound, wording in tiers:
            if probability < upper_bound:
                return wording
        return "I am positive"

    prob_phrases = [wording_for(probability) for probability in probabilities]

    for line, artist, probability, wording in zip(lyrics, predictions, probabilities, prob_phrases):
        print(f"Line: {line}\n{wording} that line is from a {artist} song ({probability:.0%} sure)\n")

### Prepare corpus

In [None]:
# Prepare corpus and labels: CORPUS holds the lyric lines, LABELS the artist
# of the line at the same position
CORPUS, LABELS = prepare_corpus(df_corpus)
# Sanity check: every line needs exactly one label
assert(len(CORPUS) == len(LABELS))

In [None]:
# Preprocess data (lowercase, tokenize, lemmatize)
CORPUS_CLEAN = preprocess_corpus(CORPUS)
# Sanity check: preprocessing must not add or drop lines
assert(len(CORPUS_CLEAN) == len(LABELS))

### Instantiate the model

In [None]:
# Get the English stop words from NLTK.
# NOTE: this rebinds the name STOPWORDS that was imported from `wordcloud` at
# the top of the file; code that reads the global STOPWORDS after this line
# gets the NLTK list instead of the wordcloud set.
STOPWORDS = list(stopwords.words('english'))

### Hyperparameter tuning

In [None]:
# Pipeline: TF-IDF vectorization followed by a multinomial naive Bayes
# classifier. The vectorizer step is named "tdidf" (sic); the parameter grid
# below addresses it through that exact prefix.
model = Pipeline(
    steps=[
        ("tdidf", TfidfVectorizer(stop_words=STOPWORDS)),
        ("nb", MultinomialNB()),
    ]
)

# Candidate values, addressed as "<step name>__<parameter>"
param_grid = {
    "nb__alpha": [0.1, 0.5, 1, 2, 3],
    "nb__fit_prior": [True, False],
    "tdidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation
gscv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# initial time
ti = time.time()

# grid-search cross-validation
gscv.fit(CORPUS_CLEAN, LABELS)

# final time
tf = time.time()

# time taken
print(f"time taken: {round(tf-ti,2)} sec")

print(f"Best parameters: {gscv.best_params_}")
print(f"Best score: {round(gscv.best_score_,6)}")

# From here on `model` is the pipeline with the best parameter combination
model = gscv.best_estimator_

### Fit Classification model

In [None]:
# Fit the pipeline on the preprocessed lines; the vectorizer step inside the
# pipeline turns the text into features
# NOTE(review): GridSearchCV refits the best estimator by default, so this
# call is presumably redundant - confirm before removing
model.fit(CORPUS_CLEAN, LABELS)

# Check score. This is the accuracy on the same data the model was fitted on,
# so it does not measure generalization
model.score(CORPUS_CLEAN, LABELS)

### Use the trained model to predict for new lyrics

In [None]:
# Hand-picked example lines used to try out the trained model
lyrics = [
    "From the era of terror, check this photo lens",
    "beautiful freak",
    "Fuck you I won't do what you tell me",
    "Bombtrack",
    "the mistakes of my youth",
    "Check it, since fifteen hundred and sixteen, minds attacked and overseen",
    "Shock around tha clock, from noon 'til noon",
    "When I came into this world they slapped me",
    "Or should I just keep chasing pavements?",
]

In [None]:
# Preprocess the example lines the same way as the training data
lyrics_clean = preprocess_corpus(lyrics)

# Get results: the predicted artist and, per line, the highest class
# probability (i.e. the probability of the predicted artist)
predictions = model.predict(lyrics_clean)
probabilities = [p.max() for p in model.predict_proba(lyrics_clean)]

# Print results (the original, unprocessed lines are shown)
print_results(lyrics, predictions, probabilities)

## User input

In [None]:
# Interactive loop: classify lines typed by the user until a quit command
# is entered
quit_commands = {"quit", "q", "exit"}

while True:
    try:
        user_input = input(
            "Enter a line from a song by the Eels, Adele, or Rage Against the Machine"
            " (or 'q' to quit): "
        ).strip()
    except EOFError:
        # No more input available (e.g. closed stdin)
        break

    # Quit commands are accepted regardless of case or surrounding spaces
    if user_input.lower() in quit_commands:
        break

    # Nothing to classify
    if not user_input:
        continue

    lyrics = [user_input]

    # Preprocess
    lyrics_clean = preprocess_corpus(lyrics)

    # Get results
    predictions = model.predict(lyrics_clean)
    probabilities = [p.max() for p in model.predict_proba(lyrics_clean)]

    # Print results
    print_results(lyrics, predictions, probabilities)
