# TF-IDF with Logistic Regression

In [62]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [63]:
# Load the per-track lyrics table (columns: artist, album, track, lyrics).
LYRICS_CSV_PATH = "./data/tracks_with_lyrics_since_2013.csv"
tracks_by_artist = pd.read_csv(LYRICS_CSV_PATH)

In [64]:
# Preview the first five rows to check the columns loaded as expected.
tracks_by_artist.head(5)

Unnamed: 0,artist,album,track,lyrics
0,2 Chainz,ColleGrove,Not Invited,[Intro]\nOf course I'ma stunt\nOf course I'ma ...
1,2 Chainz,Based On A T.R.U. Story,Money Machine,"[Intro]\nI told them, get on my level\nBitch, ..."
2,2 Chainz,ColleGrove,MF'N Right,[Produced by Mike WiLL Made It & Zaytoven]\n\n...
3,2 Chainz,ColleGrove,100 Joints,[Hook]\nYeah uum\nNo matter what they say I sm...
4,2 Chainz,Based On A T.R.U. Story,I'm Different,"[Intro: YG]\nMustard on the beat, ho!\n\n[Hook..."


In [65]:
# Keep only the rows whose lyrics are present, then preview the end of the table.
has_lyrics = tracks_by_artist["lyrics"].notnull()
tracks_by_artist = tracks_by_artist[has_lyrics]
tracks_by_artist.tail()

Unnamed: 0,artist,album,track,lyrics
1049,Z-Ro,Drankin & Drivin,Devil Ass City,[Hook]\nI only fear god in this devil ass city...
1051,Z-Ro,Legendary,"Dome, Kush, and Codeine","[Hook]\nDome, kush, and codeine\nYou know that..."
1053,Z-Ro,Drankin & Drivin,New Shit,[Hook]\nIf you don't like my new shit you can ...
1056,Z-Ro,Legendary,Out His Mind,[Chorus: Z-Ro]\nZ-ro must be out his mind\nThi...
1058,Z-Ro,Drankin & Drivin,Hate Me So Much,"[Verse 1]\nY'all can keep on talking, dropping..."


In [66]:
# sample = tracks_by_artist[tracks_by_artist["artist"].isin(["2 Chainz","A$AP Rocky"])]

In [67]:
# Helper functions

def tokenizer(raw_lyrics):
    """Split raw Genius-formatted lyrics into a list of lower-cased word tokens.

    Parameters
    ----------
    raw_lyrics : str
        Lyrics text, including Genius annotations such as ``[Hook]`` headers.

    Returns
    -------
    list of str
        Whitespace-separated tokens, lower-cased; an empty list for empty input.
    """
    # Some choices here are specific to the format of Genius lyrics: remove
    # non-vocalised text in square brackets and other text that is not part of
    # the main song body, such as adlibs in round brackets. The second
    # alternative strips any remaining run of characters that is not a word
    # character, apostrophe or whitespace.
    # Raw string: the pattern contains regex escapes (\(, \[, \w, ...) that are
    # invalid *string* escapes and trigger warnings in a normal literal.
    stripped = re.sub(r"([\(\[].*?[\)\]])|([^\w\d'\s]+)", "", raw_lyrics)
    # Ignore case. split() with no argument already treats newlines (and any
    # other whitespace run) as a separator, so no newline substitution is needed.
    return stripped.lower().split()

def normalise_vector(vector):
    """Scale a 1-D vector to unit Euclidean length.

    Parameters
    ----------
    vector : array-like
        1-D numeric vector.

    Returns
    -------
    numpy.ndarray
        ``vector`` divided by its Euclidean norm. A zero vector has no
        direction, so it is returned as a float array of zeros instead of the
        all-NaN result (and RuntimeWarning) that dividing by zero would give.
    """
    norm = np.sqrt(np.dot(vector, vector))
    if norm == 0:
        return np.zeros_like(vector, dtype=float)
    return vector / norm

def add_vectors(vectors):
    """Sum a collection of vectors element-wise and normalise the result.

    The sum is taken along axis 0 and then passed through ``normalise_vector``.
    """
    summed = np.sum(vectors, axis=0)
    return normalise_vector(summed)

def ranked_words_by_weighting(vector, top_n):
    """Return the ``top_n`` highest-weighted (feature name, weight) pairs.

    Feature names come from the module-level ``vectorizer``, which must already
    be fitted when this function is called.

    Parameters
    ----------
    vector : sequence of float
        One weight per vectorizer feature, in feature order.
    top_n : int
        Number of pairs to return.

    Returns
    -------
    list of tuple
        ``(feature_name, weight)`` pairs sorted by descending weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    weighted_terms = zip(feature_names, vector)
    return sorted(weighted_terms, key=lambda pair: pair[1], reverse=True)[:top_n]

def most_representative_songs(vector, tracks_by_artist, top_n):
    """Return the ``top_n`` tracks whose vectors are closest to ``vector``.

    Closeness is the cosine distance between ``vector`` and each row's
    ``normalised_vectors`` entry. The input frame is not modified; the result
    is a new frame with an added ``dist`` column, sorted by ascending distance.
    """
    distances = tracks_by_artist["normalised_vectors"].apply(
        lambda candidate: cosine(vector, candidate)
    )
    ranked = tracks_by_artist.assign(dist=distances).sort_values("dist")
    return ranked.iloc[:top_n]

def most_similar_artists(vector, artist_vectors, top_n):
    """Return the ``top_n`` artists closest to ``vector``, skipping the nearest one.

    The index of ``artist_vectors`` is moved into a column, each row's
    ``normalised_vectors`` entry is scored by cosine distance to ``vector``,
    and rows are sorted by ascending distance. The first row is dropped from
    the result (presumably because it is the query artist itself -- confirm
    against the caller), so rows 1..top_n of the ranking are returned.
    """
    flattened = artist_vectors.reset_index()
    distances = flattened["normalised_vectors"].apply(
        lambda candidate: cosine(vector, candidate)
    )
    ranked = flattened.assign(dist=distances).sort_values("dist")
    return ranked.iloc[1:top_n + 1]

In [68]:
# Ignore for the time being: candidate max_df / min_df settings for a later hyper-parameter sweep.
# hyperparamters = [
#     {"max_df": 1.0, "min_df": 1}, # i.e. default values
#     {"max_df": 1.0, "min_df": 0.001},
#     {"max_df": 0.7, "min_df": 1},
#     {"max_df": 0.7, "min_df": 0.001}
# ]

In [87]:
# for hp in hyperparamters:

vectorizer = TfidfVectorizer(
    # Passed as a list: scikit-learn >= 1.2 validates stop_words as 'english',
    # a list or None, and rejects the frozenset ENGLISH_STOP_WORDS itself.
    stop_words=list(ENGLISH_STOP_WORDS),
    # tokenizer() already lower-cases the text.
    lowercase=False,
    max_df=0.7,
    min_df=5,
    ngram_range=(1,3),
    tokenizer=tokenizer,
    # token_pattern is unused once a custom tokenizer is supplied; None
    # silences the warning scikit-learn would otherwise emit about that.
    token_pattern=None,
    # Normalisation is done explicitly below so both raw and unit-length
    # vectors are kept.
    norm=None
)

tracks_by_artist_copy = tracks_by_artist.copy()
tracks_by_artist_copy["unnormalised_vectors"] = list(vectorizer.fit_transform(tracks_by_artist_copy["lyrics"].values).toarray())
# Found case where unnormalised vector is zero vector, in this case an Eminem skit.
# Drop those rows BEFORE normalising so no vector is divided by a zero norm.
has_terms = tracks_by_artist_copy["unnormalised_vectors"].apply(lambda v: bool(np.any(v)))
tracks_by_artist_copy = tracks_by_artist_copy[has_terms].copy()
tracks_by_artist_copy["normalised_vectors"] = tracks_by_artist_copy["unnormalised_vectors"].apply(normalise_vector)

In [88]:
# Hold out 30% of the tracks for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    tracks_by_artist_copy,
    test_size=0.3,
    random_state=42,
)

In [89]:
# Features are the unit-length TF-IDF vectors; labels are the artist names.
X_train = train["normalised_vectors"].values.tolist()
y_train = train["artist"].values.tolist()
X_test = test["normalised_vectors"].values.tolist()
y_test = test["artist"].values.tolist()

In [101]:
# Fit an artist classifier on the training vectors and score it on the held-out
# tracks. C is the inverse regularisation strength, so C=1e5 leaves the model
# almost unregularised.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# The F1 score is weighted by the number of true samples per artist.
print('Testing accuracy {}'.format(accuracy_score(y_test, y_pred)))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.3018181818181818
Testing F1 score: 0.29376704816277005


  'precision', 'predicted', average, warn_for)
