# Text Classification

_Project for week 4, by Jan Kühn, April 2023_


## Project task and outline

In this project, we will build a text classification model on song lyrics. The task is to predict the artist from a piece of text. To train such a model, we first need to collect our own lyrics dataset. The project consists of the following steps:

- Download an HTML page with links to songs
- Extract hyperlinks of song pages
- Download and extract the song lyrics
- Vectorize the text using the Bag of Words method
- Train a classification model that predicts the artist from a piece of text
- Refactor the code into functions
- Write a simple command-line interface for the program
- Upload your code to GitHub


In [None]:
import time

import nltk
import pandas as pd
from includes import misc, modelling, parse
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Download the NLTK data needed for this project.
# "stopwords" backs the stopwords.words(...) lookup that builds the
# vectorizer's stop-word list further down. "wordnet" is presumably required
# by the preprocessing in includes.modelling (e.g. lemmatization) — TODO confirm.
nltk.download("wordnet")
nltk.download("stopwords")

In [None]:
# --- Pipeline switches ------------------------------------------------------
# PARSE_HTML gates the lyrics-parsing cell and CREATE_WORDCLOUDS gates the
# word-cloud cells below. The two SCRAPE_* switches are not read anywhere in
# this notebook; presumably the scraping helpers consume them — TODO confirm.
SCRAPE_SONG_LIST = False
SCRAPE_SONGS = False
PARSE_HTML = False
CREATE_WORDCLOUDS = False

# --- Scraping settings ------------------------------------------------------
# NOTE(review): SLEEP_SEC and HEADER are not used in this notebook either;
# they look like the delay between requests and the HTTP request headers —
# verify against includes.parse.
SLEEP_SEC = 3
HEADER = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) "
        "Gecko/20100101 Firefox/108.0"
    ),
}

# --- Artists ----------------------------------------------------------------
# Artist name -> URL of the artist's song-list page on lyrics.com. The URLs
# only differ in the dash-separated artist name and the numeric artist id.
_ARTIST_PAGE = "https://www.lyrics.com/artist.php?name={slug}&aid={aid}&o=1"
artist_urls = {
    artist: _ARTIST_PAGE.format(slug=artist.replace(" ", "-"), aid=aid)
    for artist, aid in (
        ("Eels", 182509),
        ("Rage Against the Machine", 23206),
        ("Adele", 861756),
    )
}

### Run the code


In [None]:
if PARSE_HTML:
    # Collect the song URLs for every artist configured in `artist_urls`
    song_urls = parse.get_song_urls(artist_urls)

    # Parse the lyrics for those songs from files.
    # NOTE(review): the helper is also expected to save the result as a CSV
    # file — presumably the "data/songs_clean.csv" that is read back in the
    # next cell; confirm in includes.parse.
    songs = parse.parse_lyrics_from_files(song_urls)

In [None]:
# Load the song table from disk, using the first CSV column as the index.
songs = pd.read_csv("data/songs_clean.csv", index_col=0)
# Bare expression: the notebook renders the DataFrame as the cell output.
songs

In [None]:
# Build the corpus DataFrame used by the word clouds and the model.
# NOTE(review): judging by the helper's name, each row holds one lyric line
# together with its artist — confirm in includes.misc.
df_corpus = misc.convert_lyrics_to_lines(songs)
# Relative share of rows per artist (fractions, not counts), shown as the
# cell output to check how balanced the classes are.
df_corpus["artist"].value_counts(normalize=True)

## Wordcloud


#### Eels


In [None]:
if CREATE_WORDCLOUDS:
    # Select the Eels rows, then glue their lyrics into one space-separated
    # string for the word cloud.
    artist_mask = df_corpus["artist"] == "Eels"
    corpus = " ".join(df_corpus.loc[artist_mask, "lyrics"])
    misc.plot_wordcloud(corpus, name="Eels", shape="circle")

#### Rage Against the Machine


In [None]:
if CREATE_WORDCLOUDS:
    # Select the Rage Against the Machine rows, then glue their lyrics into
    # one space-separated string for the word cloud.
    artist_mask = df_corpus["artist"] == "Rage Against the Machine"
    corpus = " ".join(df_corpus.loc[artist_mask, "lyrics"])
    misc.plot_wordcloud(corpus, name="ratm", shape="text")

#### Adele


In [None]:
if CREATE_WORDCLOUDS:
    # Select the Adele rows, then glue their lyrics into one space-separated
    # string for the word cloud.
    artist_mask = df_corpus["artist"] == "Adele"
    corpus = " ".join(df_corpus.loc[artist_mask, "lyrics"])
    misc.plot_wordcloud(corpus, name="Adele", shape="circle")

## Model

### Prepare corpus

In [None]:
# Prepare corpus and labels
CORPUS, LABELS = modelling.prepare_corpus(df_corpus)

# Every document needs exactly one label. Raise explicitly rather than using
# `assert`, which is skipped entirely when Python runs with -O and would give
# no hint about the actual sizes.
if len(CORPUS) != len(LABELS):
    raise ValueError(
        f"Corpus/label length mismatch: {len(CORPUS)} documents "
        f"vs {len(LABELS)} labels"
    )

In [None]:
# Preprocess data
CORPUS_CLEAN = modelling.preprocess_corpus(CORPUS)

# Preprocessing must not drop or add documents, otherwise the labels no longer
# line up. Raise explicitly rather than using `assert`, which is skipped
# entirely when Python runs with -O.
if len(CORPUS_CLEAN) != len(LABELS):
    raise ValueError(
        f"Preprocessed corpus/label length mismatch: {len(CORPUS_CLEAN)} "
        f"documents vs {len(LABELS)} labels"
    )

### Instantiate the model

In [None]:
# English stop words from NLTK, copied into a plain list for the vectorizer
STOPWORDS = [*stopwords.words("english")]

### Hyperparameter tuning

In [None]:
# Pipeline: TF-IDF features (with the NLTK stop words removed) feeding a
# multinomial naive Bayes classifier.
# NOTE: the vectorizer step is registered as "tdidf" (sic); the parameter-grid
# keys below have to keep using exactly that prefix.
model = Pipeline(
    steps=[
        ("tdidf", TfidfVectorizer(stop_words=STOPWORDS)),
        ("nb", MultinomialNB()),
    ]
)

# Search space, keyed as "<step name>__<parameter>":
# 5 alphas x 2 fit_prior settings x 3 n-gram ranges = 30 candidates.
param_grid = {
    "nb__alpha": [0.1, 0.5, 1, 2, 3],
    "nb__fit_prior": [True, False],
    "tdidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

# 5-fold cross-validated grid search, ranked by accuracy.
gscv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Time the search with a monotonic clock: time.perf_counter() is intended for
# measuring durations, whereas time.time() follows the wall clock and can jump
# when the system time is adjusted.
ti = time.perf_counter()

# grid-search cross-validation
gscv.fit(CORPUS_CLEAN, LABELS)

tf = time.perf_counter()

# time taken
print(f"time taken: {round(tf - ti, 2)} sec")

print(f"Best parameters: {gscv.best_params_}")
print(f"Best score: {round(gscv.best_score_, 6)}")

# Continue with the best pipeline found by the search.
model = gscv.best_estimator_

### Fit Classification model

In [None]:
# Fit the pipeline on the full preprocessed corpus (the raw text is vectorized
# inside the pipeline) and show its accuracy as the cell output.
# The score is computed on the very data the model was fitted on, so it is a
# training accuracy, not an estimate of performance on unseen lyrics.
model.fit(CORPUS_CLEAN, LABELS).score(CORPUS_CLEAN, LABELS)

### Use the trained model to predict for new lyrics

In [None]:
# Hand-picked sample lines used to try out the trained model; they are
# preprocessed and classified in the next cell.
lyrics = [
    "From the era of terror, check this photo lens",
    "beautiful freak",
    "Fuck you I won't do what you tell me",
    "Bombtrack",
    "the mistakes of my youth",
    "Check it, since fifteen hundred and sixteen, minds attacked and overseen",
    "Shock around tha clock, from noon 'til noon",
    "When I came into this world they slapped me",
    "Or should I just keep chasing pavements?",
]

In [None]:
# Run the sample lines through the same preprocessing as the training corpus
lyrics_clean = modelling.preprocess_corpus(lyrics)

# Predicted label per line, plus the highest class probability for each line
predictions = model.predict(lyrics_clean)
class_probabilities = model.predict_proba(lyrics_clean)
probabilities = [row.max() for row in class_probabilities]

# Show each original line next to its prediction and probability
modelling.print_results(lyrics, predictions, probabilities)

## User input

In [None]:
# Interactive loop: classify lines typed by the user until they ask to stop.
quit_commands = {"quit", "q", "exit"}
prompt = (
    "Enter a line from a song by the Eels, Adele, or Rage Against the Machine "
    "(or 'q' to quit): "
)

while True:
    try:
        user_input = input(prompt)
    except EOFError:
        # Input stream is closed/exhausted: stop instead of crashing
        break

    # Compare commands case-insensitively and ignoring surrounding whitespace,
    # so that e.g. "Quit" or " q " also end the loop.
    command = user_input.strip().casefold()
    if command in quit_commands:
        break
    if not command:
        # Nothing to classify — ask again instead of predicting on empty text
        continue

    lyrics = [user_input]

    # Preprocess
    lyrics_clean = modelling.preprocess_corpus(lyrics)

    # Get results: predicted label and the highest class probability
    predictions = model.predict(lyrics_clean)
    probabilities = [p.max() for p in model.predict_proba(lyrics_clean)]

    # Print results
    modelling.print_results(lyrics, predictions, probabilities)
