# TF-IDF with Logistic Regression

In [66]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import utils

%matplotlib inline

In [67]:
tracks_by_artist = pd.read_csv("./data/tracks_with_lyrics_for_top_10_artists.csv")

In [68]:
tracks_by_artist.head()

Unnamed: 0,artist,album,track,lyrics
0,T.I.,Trap Muzik,I Can't Quit,"[Intro]\nHuh, hell nah, I can't quit\nHell nah..."
1,T.I.,Trap Muzik,Be Easy,"[Intro]\nUh-uh, uh-uh, uh\nAye, where the pian..."
2,T.I.,Trap Muzik,No More Talk,[Verse 1]\nI'm either running for my life or I...
3,T.I.,Trap Muzik,Doin My Job,"[T.I. - talking]\nAy I'm working here, know wh..."
4,T.I.,Trap Muzik,24's,"[Intro]\nYeah\nFor all my real ATL niggas, tha..."


In [4]:
tracks_by_artist = tracks_by_artist[~tracks_by_artist["lyrics"].isnull()]
tracks_by_artist.tail()

Unnamed: 0,artist,album,track,lyrics
751,Tech N9ne,Everready (The Religion),The Melancholy Maze & My World Intro,[Intro]\nYou have entered The Melancholy Maze\...
752,Tech N9ne,Everready (The Religion),The Beast,[Intro 1: Krizz Kaliko]\nInsanity at it's fine...
753,Tech N9ne,Everready (The Religion),Intro to the Strange Music Library,"[Tech N9ne]\nYo, what's up?\nTech N9ne here\nA..."
754,Tech N9ne,Everready (The Religion),That Owl,[Verse 1]\nHe wild out his style 'bout a mile ...
755,Tech N9ne,Everready (The Religion),In My Head,First entry to Everready: The Religion\nIt's c...


In [5]:
# sample = tracks_by_artist[tracks_by_artist["artist"].isin(["2 Chainz","A$AP Rocky"])]

In [6]:
# Helper functions

def tokenizer(raw_lyrics):
    # Some choices here are specific to the format of Genius lyrics, want to remove non-vocalised 
    # text in square brackets, and other text not part of the main song body such as adlibs in 
    # round brackets
    raw_lyrics = re.sub("([\(\[].*?[\)\]])|([^\w\d'\s]+)", "", raw_lyrics)
    raw_lyrics = re.sub("\n", " ", raw_lyrics)
    # Ignore case    
    raw_lyrics = raw_lyrics.lower()
    word_list = raw_lyrics.split()

    return word_list

def normalise_vector(vector):
    return vector / np.sqrt(np.dot(vector,vector))

def add_vectors(vectors):
    return normalise_vector(np.sum(vectors, axis=0))

def ranked_words_by_weighting(vector, top_n):
    return sorted(zip(vectorizer.get_feature_names(), vector), key=lambda x: x[1], reverse=True)[:top_n]

def most_representative_songs(vector, tracks_by_artist, top_n):
    df = tracks_by_artist.copy()
    df["dist"] = df["normalised_vectors"].apply(lambda target: cosine(vector, target))
    df.sort_values("dist", inplace=True)
    return df[:top_n]

def most_similar_artists(vector, artist_vectors, top_n):
    df = artist_vectors.reset_index().copy()
    df["dist"] = df["normalised_vectors"].apply(lambda target: cosine(vector, target))
    df.sort_values("dist", inplace=True)
    return df[1:top_n+1]

In [7]:
# ignore for time being
# hyperparamters = [
#     {"max_df": 1.0, "min_df": 1}, # i.e. default values
#     {"max_df": 1.0, "min_df": 0.001},
#     {"max_df": 0.7, "min_df": 1},
#     {"max_df": 0.7, "min_df": 0.001}
# ]

In [8]:
# for hp in hyperparamters:

vectorizer = TfidfVectorizer(
    stop_words=ENGLISH_STOP_WORDS,
#     lowercase=False,
    max_df=0.7,
#     min_df=5,
#     ngram_range=(1,5),
#     tokenizer=tokenizer,
#     norm=None
)

tracks_by_artist_copy = tracks_by_artist.copy()
tracks_by_artist_copy["unnormalised_vectors"] = list(vectorizer.fit_transform(tracks_by_artist_copy["lyrics"].values).toarray())
tracks_by_artist_copy["normalised_vectors"] = tracks_by_artist_copy["unnormalised_vectors"].apply(normalise_vector)
tracks_by_artist_copy = tracks_by_artist_copy[tracks_by_artist_copy["unnormalised_vectors"].apply(sum) != 0]

In [9]:
# train, test = train_test_split(tracks_by_artist_copy, test_size=0.3, random_state=42)

In [37]:
data_x = tracks_by_artist_copy[['normalised_vectors']].as_matrix()
data_y = tracks_by_artist_copy[['artist']].as_matrix()
stratified_split = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=34)
for train_index, test_index in stratified_split.split(data_x, data_y):
    x_train, x_test = data_x[train_index], data_x[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

X_train = [x[0] for x in x_train.tolist()]
X_test = [x[0] for x in x_test.tolist()]

In [38]:
# y_train, X_train = train["artist"].values.tolist(), train["normalised_vectors"].values.tolist()
# y_test, X_test = test["artist"].values.tolist(), test["normalised_vectors"].values.tolist()

In [47]:
logreg = LogisticRegression(n_jobs=1, C=1e6)
logreg.fit(X_train, y_train.ravel())
y_pred = logreg.predict(X_test)

prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.8186274509803921
Testing Precision: 0.823093891861945
Testing Recall: 0.8186274509803921
Testing FScore: 0.8155731648308945
Testing Support: None


In [60]:
from sklearn import svm

svc = svm.SVC(gamma=0.01, decision_function_shape='ovr', kernel="rbf")
svc.fit(X_train, y_train.ravel())
y_pred = svc.predict(X_test)

# Handle case where label not predicted in test case
prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.14705882352941177
Testing Precision: 0.14705882352941177
Testing Recall: 1.0
Testing FScore: 0.25641025641025644
Testing Support: None


In [61]:
from sklearn import svm

# svc = svm.SVC(gamma='scale', decision_function_shape='ovo')
linear_svc = svm.LinearSVC()
linear_svc.fit(X_train, y_train.ravel())
y_pred = linear_svc.predict(X_test)

# Handle case where label not predicted in test case
prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.8137254901960784
Testing Precision: 0.8251199175218716
Testing Recall: 0.8137254901960784
Testing FScore: 0.81093557446316
Testing Support: None


In [65]:
nb = GaussianNB()
nb.fit(X_train, y_train.ravel())
y_pred = nb.predict(X_test)

# Handle case where label not predicted in test case
prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.46078431372549017
Testing Precision: 0.5540927482103953
Testing Recall: 0.46078431372549017
Testing FScore: 0.4514795579349068
Testing Support: None
