# TF-IDF with Additional Features for Logistic Regression

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import utils

%matplotlib inline

In [2]:
tracks_by_artist = pd.read_csv("./data/tracks_with_lyrics_for_top_10_artists.csv")

In [3]:
tracks_by_artist.head()

Unnamed: 0,artist,album,track,lyrics
0,T.I.,Trap Muzik,I Can't Quit,"[Intro]\nHuh, hell nah, I can't quit\nHell nah..."
1,T.I.,Trap Muzik,Be Easy,"[Intro]\nUh-uh, uh-uh, uh\nAye, where the pian..."
2,T.I.,Trap Muzik,No More Talk,[Verse 1]\nI'm either running for my life or I...
3,T.I.,Trap Muzik,Doin My Job,"[T.I. - talking]\nAy I'm working here, know wh..."
4,T.I.,Trap Muzik,24's,"[Intro]\nYeah\nFor all my real ATL niggas, tha..."


In [4]:
tracks_by_artist = tracks_by_artist[~tracks_by_artist["lyrics"].isnull()]
tracks_by_artist.tail()

Unnamed: 0,artist,album,track,lyrics
751,Tech N9ne,Everready (The Religion),The Melancholy Maze & My World Intro,[Intro]\nYou have entered The Melancholy Maze\...
752,Tech N9ne,Everready (The Religion),The Beast,[Intro 1: Krizz Kaliko]\nInsanity at it's fine...
753,Tech N9ne,Everready (The Religion),Intro to the Strange Music Library,"[Tech N9ne]\nYo, what's up?\nTech N9ne here\nA..."
754,Tech N9ne,Everready (The Religion),That Owl,[Verse 1]\nHe wild out his style 'bout a mile ...
755,Tech N9ne,Everready (The Religion),In My Head,First entry to Everready: The Religion\nIt's c...


In [5]:
# Helper functions

def tokenizer(raw_lyrics):
    # Some choices here are specific to the format of Genius lyrics, want to remove non-vocalised 
    # text in square brackets, and other text not part of the main song body such as adlibs in 
    # round brackets
    raw_lyrics = re.sub("([\(\[].*?[\)\]])|([^\w\d'\s]+)", "", raw_lyrics)
    raw_lyrics = re.sub("\n", " ", raw_lyrics)
    # Ignore case    
    raw_lyrics = raw_lyrics.lower()
    word_list = raw_lyrics.split()

    return word_list

def normalise_vector(vector):
    return vector / np.sqrt(np.dot(vector,vector))

def add_vectors(vectors):
    return normalise_vector(np.sum(vectors, axis=0))

def ranked_words_by_weighting(vector, top_n):
    return sorted(zip(vectorizer.get_feature_names(), vector), key=lambda x: x[1], reverse=True)[:top_n]

def most_representative_songs(vector, tracks_by_artist, top_n):
    df = tracks_by_artist.copy()
    df["dist"] = df["normalised_vectors"].apply(lambda target: cosine(vector, target))
    df.sort_values("dist", inplace=True)
    return df[:top_n]

def most_similar_artists(vector, artist_vectors, top_n):
    df = artist_vectors.reset_index().copy()
    df["dist"] = df["normalised_vectors"].apply(lambda target: cosine(vector, target))
    df.sort_values("dist", inplace=True)
    return df[1:top_n+1]

In [6]:
# ignore for time being
# hyperparamters = [
#     {"max_df": 1.0, "min_df": 1}, # i.e. default values
#     {"max_df": 1.0, "min_df": 0.001},
#     {"max_df": 0.7, "min_df": 1},
#     {"max_df": 0.7, "min_df": 0.001}
# ]

In [12]:
# for hp in hyperparamters:

vectorizer = TfidfVectorizer(
    stop_words=ENGLISH_STOP_WORDS,
    max_df=0.7,
    min_df=5,
#     tokenizer=tokenizer,
#     norm=None
)

tracks_by_artist_copy = tracks_by_artist.copy()
tracks_by_artist_copy["unnormalised_vectors"] = list(vectorizer.fit_transform(tracks_by_artist_copy["lyrics"].values).toarray())
tracks_by_artist_copy["normalised_vectors"] = tracks_by_artist_copy["unnormalised_vectors"].apply(normalise_vector)
tracks_by_artist_copy = tracks_by_artist_copy[tracks_by_artist_copy["unnormalised_vectors"].apply(sum) != 0]

In [13]:
train, test = train_test_split(tracks_by_artist_copy, test_size=0.3, random_state=42)

In [14]:
len(vectorizer.get_feature_names())

3695

In [15]:
y_train, X_train = train["artist"].values.tolist(), train["normalised_vectors"].values.tolist()
y_test, X_test = test["artist"].values.tolist(), test["normalised_vectors"].values.tolist()

In [16]:
logreg = LogisticRegression(n_jobs=1, C=1e6)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[0]))
print('Testing Recall: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[1]))
print('Testing FScore: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[2]))
print('Testing Support: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[3]))

Testing accuracy 0.8578431372549019
Testing Precision: 0.8639245761676186
Testing Recall: 0.8578431372549019
Testing FScore: 0.8549705521894324
Testing Support: None


In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
class LyricsExtractor(BaseEstimator, TransformerMixin):

#     def __init__(self, key="lyrics"):
#         self.key = key
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, train):
        return train["lyrics"].values

In [57]:
class LineCountExtractor(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self
    
    def transform(self, data):
        return data["lyrics"].str.split("\n").apply(len)

In [58]:
class AverageLineLengthExtractor(BaseEstimator, TransformerMixin):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, data):
        return data["lyrics"].str.split("\n").apply(lambda lines: sum(map(len,lines))/len(lines))

In [111]:
class UniqueWordProportionExtractor(BaseEstimator, TransformerMixin):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, data):
        return data["lyrics"].apply(lambda lyrics: self.prop_unique_words(lyrics))
    
    def prop_unique_words(self, lyrics):
        return self.get_unique_word_count(lyrics) / self.get_total_word_count(lyrics)
    
    def get_total_word_count(self, lyrics):
        return len(re.split(" |\n|", lyrics))
    
    def get_unique_word_count(self, lyrics):
        return len(set(re.split(" |\n|", lyrics)))

In [112]:
class ArrayCaster(BaseEstimator, TransformerMixin):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, data):
        return np.transpose(np.matrix(data))

In [118]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# pipeline = Pipeline([
#         ("union", FeatureUnion(
#             transformer_list = [
#                 ("song_vectors", Pipeline([
#                     ("lyrics", LyricsExtractor()),
#                     ("tfidf", TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS))
#                 ])),
#                 ("line_count", Pipeline([
#                     ("get_line_count", LineCountExtractor()),
#                     ("caster", ArrayCaster())
#                 ])),
#                 ("average_line_length", Pipeline([
#                     ("get_average_line_length", AverageLineLengthExtractor()),
#                     ("caster", ArrayCaster())
#                 ])),
#                 ("unique_word_proportion", Pipeline([
#                     ("get_unique_word_proportion", UniqueWordProportionExtractor()),
#                     ("caster", ArrayCaster())
#                 ])),
#             ]
#             need to perform grid search with these weights
#             transformer_weights={
#                 "song_vectors": 1.0,
#                 "line_count": 1.0,
#                 "average_line_length": 1.0,
#                 "unique_word_proportion": 1.0
#             },
#         )),
#         ("logreg", LogisticRegression(n_jobs=1, C=1e6))
#         ("logreg", LogisticRegression())
#     ])

song_vect = Pipeline([
                    ("lyrics", LyricsExtractor()),
                    ("tfidf", TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS))
                ])
# line_count = Pipeline([
#                     ("get_line_count", LineCountExtractor()),
#                     ("caster", ArrayCaster())
#                 ])

# featureunionvect = FeatureUnion([("line_count", line_count), ("song_vect", song_vect)])
featureunionvect = FeatureUnion([("song_vect", song_vect)])

classifier = LogisticRegression()
pipeline = Pipeline([('vect', featureunionvect), ('classifier', classifier)])

# change these
parameters = {
#     "vect__transformer_list": [
#         {"song_vectors":1,"line_count":0}, 
#         {"song_vectors":0,"line_count":1}, 
#         {"song_vectors":1,"line_count":1}
#     ]
}
gs_clf = GridSearchCV(pipeline, parameters)

# dataset
tracks_by_artist_copy = tracks_by_artist.copy()
train, test = train_test_split(tracks_by_artist_copy, test_size=0.3, random_state=42)
y_train, X_train = train["artist"], train
y_test, X_test = test["artist"], test

# classifier
gs_clf.fit(X_train, y_train)

#evaluation
for score in gs_clf.grid_scores_:
    print("gridsearch scores", score)

# y_pred = pipeline.predict(X_test)

# from sklearn.metrics import accuracy_score, f1_score
# from sklearn.metrics import precision_recall_fscore_support

# print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
# print('Testing Precision: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[0]))
# print('Testing Recall: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[1]))
# print('Testing FScore: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[2]))
# print('Testing Support: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[3]))

gridsearch scores mean: 0.64632, std: 0.01673, params: {}


In [None]:
# len(y_train) # 475
# len(X_train) # 475
# len(y_test) # 204
# len(X_test) # 204