# POS-Tagging with Logistic Regression

In [84]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import utils

%matplotlib inline

In [21]:
tracks_by_artist = pd.read_csv("./data/tracks_with_lyrics_for_top_10_artists.csv")

In [22]:
tracks_by_artist = tracks_by_artist[~tracks_by_artist["lyrics"].isnull()]

In [72]:
# Helper functions

def tokenizer(raw_lyrics):
    # Some choices here are specific to the format of Genius lyrics, want to remove non-vocalised 
    # text in square brackets, and other text not part of the main song body such as adlibs in 
    # round brackets
    raw_lyrics = re.sub("([\(\[].*?[\)\]])|([^\w\d'\s]+)", "", raw_lyrics)
    tokens = nltk.word_tokenize(raw_lyrics)
    
    # remove stop words
    tokens = [token for token in tokens if token not in ENGLISH_STOP_WORDS]
    
    tagged_tokens = nltk.pos_tag(tokens)
    token_list = ["_".join(el) for el in tagged_tokens]

    return token_list

def normalise_vector(vector):
    return vector / np.sqrt(np.dot(vector,vector))

def add_vectors(vectors):
    return normalise_vector(np.sum(vectors, axis=0))

def ranked_words_by_weighting(vector, top_n):
    return sorted(zip(vectorizer.get_feature_names(), vector), key=lambda x: x[1], reverse=True)[:top_n]

def most_representative_songs(vector, tracks_by_artist, top_n):
    df = tracks_by_artist.copy()
    df["dist"] = df["normalised_vectors"].apply(lambda target: cosine(vector, target))
    df.sort_values("dist", inplace=True)
    return df[:top_n]

def most_similar_artists(vector, artist_vectors, top_n):
    df = artist_vectors.reset_index().copy()
    df["dist"] = df["normalised_vectors"].apply(lambda target: cosine(vector, target))
    df.sort_values("dist", inplace=True)
    return df[1:top_n+1]

In [78]:
# for hp in hyperparamters:

vectorizer = TfidfVectorizer(
#     lowercase=False,
    max_df=0.7, # <- this is used in successful tf-idf vectoriser
#     min_df=5,
#     ngram_range=(1,5),
    tokenizer=tokenizer,
#     norm=None
)

tracks_by_artist_copy = tracks_by_artist.copy()
tracks_by_artist_copy["unnormalised_vectors"] = list(vectorizer.fit_transform(tracks_by_artist_copy["lyrics"].values).toarray())
tracks_by_artist_copy["normalised_vectors"] = tracks_by_artist_copy["unnormalised_vectors"].apply(normalise_vector)
tracks_by_artist_copy = tracks_by_artist_copy[tracks_by_artist_copy["unnormalised_vectors"].apply(sum) != 0]

In [79]:
data_x = tracks_by_artist_copy[['normalised_vectors']].as_matrix()
data_y = tracks_by_artist_copy[['artist']].as_matrix()
stratified_split = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=34)
for train_index, test_index in stratified_split.split(data_x, data_y):
    x_train, x_test = data_x[train_index], data_x[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

X_train = [x[0] for x in x_train.tolist()]
X_test = [x[0] for x in x_test.tolist()]

In [80]:
logreg = LogisticRegression(n_jobs=1, C=1e6)
logreg.fit(X_train, y_train.ravel())
y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.6813725490196079
Testing Precision: 0.696865674374325
Testing Recall: 0.6813725490196079
Testing FScore: 0.671243427628636
Testing Support: None


In [85]:
from sklearn import svm

# svc = svm.SVC(gamma='scale', decision_function_shape='ovo')
linear_svc = svm.LinearSVC()
linear_svc.fit(X_train, y_train.ravel())
y_pred = linear_svc.predict(X_test)

# Handle case where label not predicted in test case
prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.696078431372549
Testing Precision: 0.7139386808504457
Testing Recall: 0.696078431372549
Testing FScore: 0.6857084321961219
Testing Support: None


In [86]:
nb = MultinomialNB()
nb.fit(X_train, y_train.ravel())
y_pred = nb.predict(X_test)

# Handle case where label not predicted in test case
prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.38235294117647056
Testing Precision: 0.5880872943389913
Testing Recall: 0.6141732283464567
Testing FScore: 0.462120193535331
Testing Support: None
