# TF-IDF with Additional Features for Logistic Regression

In [135]:
import warnings
# Silence every FutureWarning for the rest of the session. This also hides
# deprecation notices from libraries, so remove it when upgrading dependencies.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [136]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import utils

%matplotlib inline

In [2]:
# Load the per-track lyrics table from the CSV export under ./data.
tracks_by_artist = pd.read_csv("./data/tracks_with_lyrics_for_top_10_artists.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
tracks_by_artist.head()

Unnamed: 0,artist,album,track,lyrics
0,T.I.,Trap Muzik,I Can't Quit,"[Intro]\nHuh, hell nah, I can't quit\nHell nah..."
1,T.I.,Trap Muzik,Be Easy,"[Intro]\nUh-uh, uh-uh, uh\nAye, where the pian..."
2,T.I.,Trap Muzik,No More Talk,[Verse 1]\nI'm either running for my life or I...
3,T.I.,Trap Muzik,Doin My Job,"[T.I. - talking]\nAy I'm working here, know wh..."
4,T.I.,Trap Muzik,24's,"[Intro]\nYeah\nFor all my real ATL niggas, tha..."


In [4]:
# Keep only the tracks that actually have lyrics, then inspect the last rows.
tracks_by_artist = tracks_by_artist.dropna(subset=["lyrics"])
tracks_by_artist.tail()

Unnamed: 0,artist,album,track,lyrics
751,Tech N9ne,Everready (The Religion),The Melancholy Maze & My World Intro,[Intro]\nYou have entered The Melancholy Maze\...
752,Tech N9ne,Everready (The Religion),The Beast,[Intro 1: Krizz Kaliko]\nInsanity at it's fine...
753,Tech N9ne,Everready (The Religion),Intro to the Strange Music Library,"[Tech N9ne]\nYo, what's up?\nTech N9ne here\nA..."
754,Tech N9ne,Everready (The Religion),That Owl,[Verse 1]\nHe wild out his style 'bout a mile ...
755,Tech N9ne,Everready (The Religion),In My Head,First entry to Everready: The Religion\nIt's c...


In [5]:
# Helper functions

def tokenizer(raw_lyrics):
    """Split raw Genius-style lyrics into lower-cased word tokens.

    Two kinds of text are stripped before splitting:

    * bracketed spans that open and close on the same line, e.g. section
      headers in square brackets and adlibs in round brackets (non-vocalised
      text that is not part of the main song body);
    * every run of characters that is not a word character, an apostrophe
      or whitespace (i.e. punctuation).

    Parameters
    ----------
    raw_lyrics : str
        Lyrics text, possibly spanning several lines.

    Returns
    -------
    list of str
        The remaining whitespace-separated tokens, lower-cased, in order.
    """
    # Raw string literal: the previous non-raw pattern relied on invalid string
    # escapes such as "\(" and "\w", which newer Python versions warn about.
    # "\d" is omitted from the negated class because "\w" already covers digits.
    cleaned = re.sub(r"([\(\[].*?[\)\]])|([^\w'\s]+)", "", raw_lyrics)
    # Ignore case. str.split() with no argument splits on any whitespace,
    # newlines included, so no separate newline replacement is needed.
    return cleaned.lower().split()

def normalise_vector(vector):
    """Scale a 1-D vector to unit Euclidean length.

    Parameters
    ----------
    vector : array-like of numbers, 1-D

    Returns
    -------
    numpy.ndarray
        ``vector`` divided by its Euclidean norm. An all-zero vector has no
        direction, so it is returned as a float zero vector instead of the
        NaN-filled result (plus RuntimeWarning) that 0/0 would produce.
    """
    norm = np.sqrt(np.dot(vector, vector))
    if norm == 0:
        return np.zeros_like(vector, dtype=float)
    return vector / norm

def add_vectors(vectors):
    """Sum the given vectors element-wise (along axis 0) and normalise the total."""
    combined = np.sum(vectors, axis=0)
    return normalise_vector(combined)

def ranked_words_by_weighting(vector, top_n, feature_names=None):
    """Return the ``top_n`` (term, weight) pairs with the largest weights.

    Parameters
    ----------
    vector : sequence of float
        One weight per term, aligned with ``feature_names``.
    top_n : int
        Number of pairs to return.
    feature_names : sequence of str, optional
        Terms aligned with ``vector``. When omitted, the names are read from
        the notebook-level ``vectorizer`` (the original behaviour).

    Returns
    -------
    list of tuple
        ``(term, weight)`` pairs sorted by descending weight.
    """
    if feature_names is None:
        # get_feature_names() was removed from scikit-learn in favour of
        # get_feature_names_out(); use whichever this installation provides.
        getter = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
        feature_names = getter()
    return sorted(zip(feature_names, vector), key=lambda pair: pair[1], reverse=True)[:top_n]

def most_representative_songs(vector, tracks_by_artist, top_n):
    """Return the ``top_n`` rows whose "normalised_vectors" entry is closest to ``vector``.

    Closeness is the cosine distance computed by ``scipy.spatial.distance.cosine``.
    The input frame is left untouched; the result carries an extra "dist"
    column and is ordered by ascending distance.
    """
    distances = tracks_by_artist["normalised_vectors"].apply(
        lambda target: cosine(vector, target)
    )
    ranked = tracks_by_artist.assign(dist=distances).sort_values("dist")
    return ranked.iloc[:top_n]

def most_similar_artists(vector, artist_vectors, top_n):
    """Return the ``top_n`` rows of ``artist_vectors`` nearest to ``vector``, skipping the nearest one.

    The index of ``artist_vectors`` is turned into a regular column, a cosine
    distance to ``vector`` is stored in "dist", and rows are ordered by
    ascending distance.
    """
    candidates = artist_vectors.reset_index()
    distances = candidates["normalised_vectors"].apply(
        lambda target: cosine(vector, target)
    )
    ranked = candidates.assign(dist=distances).sort_values("dist")
    # The closest row (position 0) is dropped -- presumably it is the query
    # artist itself at distance 0; verify against the caller.
    return ranked.iloc[1:top_n + 1]

In [6]:
# ignore for time being
# hyperparamters = [
#     {"max_df": 1.0, "min_df": 1}, # i.e. default values
#     {"max_df": 1.0, "min_df": 0.001},
#     {"max_df": 0.7, "min_df": 1},
#     {"max_df": 0.7, "min_df": 0.001}
# ]

In [12]:
# for hp in hyperparamters:

# TF-IDF representation of every track's lyrics.
# stop_words="english" selects scikit-learn's built-in list (the same words as
# ENGLISH_STOP_WORDS); recent scikit-learn parameter validation only accepts
# "english", a list or None here, not the frozenset object itself.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
    min_df=5,
#     tokenizer=tokenizer,
#     norm=None
)

tfidf_rows = vectorizer.fit_transform(tracks_by_artist["lyrics"].values).toarray()

tracks_by_artist_copy = tracks_by_artist.copy()
# NOTE(review): with the vectorizer's default norm the rows are already
# L2-normalised, so "unnormalised" only holds if norm=None is re-enabled above.
tracks_by_artist_copy["unnormalised_vectors"] = list(tfidf_rows)

# Drop tracks with no in-vocabulary terms *before* normalising, so that
# normalise_vector is never asked to divide an all-zero row by zero.
tracks_by_artist_copy = tracks_by_artist_copy[tfidf_rows.sum(axis=1) != 0].copy()
tracks_by_artist_copy["normalised_vectors"] = tracks_by_artist_copy["unnormalised_vectors"].apply(normalise_vector)

In [13]:
# Hold out 30% of the tracks for testing; the fixed random_state keeps the split repeatable.
train, test = train_test_split(tracks_by_artist_copy, test_size=0.3, random_state=42)

In [14]:
# Number of terms kept by the fitted vectorizer. vocabulary_ maps each term to
# its column index, so its length equals the feature count, and unlike
# get_feature_names() it is available in every scikit-learn release.
len(vectorizer.vocabulary_)

3695

In [15]:
# Labels are the artist names; features are the per-track TF-IDF vectors.
X_train = train["normalised_vectors"].tolist()
y_train = train["artist"].tolist()
X_test = test["normalised_vectors"].tolist()
y_test = test["artist"].tolist()

In [141]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

# Baseline: logistic regression on the TF-IDF vectors alone.
# NOTE(review): the traceback recorded under this cell ("could not convert
# string to float: <lyrics>") suggests it was last run after another cell had
# rebound X_train to the raw DataFrame. Confirm by running the notebook top to
# bottom on a fresh kernel.
logreg = LogisticRegression(n_jobs=1, C=1e6)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Compute the weighted metrics once and unpack them, instead of re-running the
# same computation for every printed line.
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='weighted')

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(precision))
print('Testing Recall: {}'.format(recall))
print('Testing FScore: {}'.format(fscore))
# With an averaging mode set, scikit-learn reports the support as None.
print('Testing Support: {}'.format(support))

ValueError: could not convert string to float: '[Intro]\nYeah\nYeah, yeah, yeah, yeah, yeah, yeah\nOkay\nAlright, a\'ight, a\'ight, a\'ight, a\'ight\nYo, yo\nAlright, I\'mma lay the chorus first\nHere we go now\n\n[Chorus]\nMy mom loved Valium and lots of drugs\nThat\'s why I am like I am \'cause I\'m like her\nBecause my mom loved Valium and lots of drugs\nThat\'s why I\'m on what I\'m on \'cause I\'m my mom\n\n[Verse 1]\nMy mom, my mom, I know you\'re probably tired\nOf hearing \'bout my mom, oh-ho, whoa-ho\nBut this is just a story of when I was just a shorty\nAnd how I became hooked on Va-aliu-um\nValium was in everything, food that I ate\nThe water that I drank, fuckin\' peas on my plate\nShe sprinkled just enough of it to season my steak\nSo every day I\'d have at least three stomachaches\nNow tell me, what kind of mother would want to see her\nSon grow up to be an undera-fuckin\'-chiever?\nMy teacher didn\'t think I was gonna be nothin\' either\n"What the fuck you stickin\' gum up under the fucking seat for?"\n"Mrs. Mathers, your son has been huffing ether\nEither that or the motherfucker\'s been puffin\' reefer"\nBut all this huffin\' and puffin\' wasn\'t what it was either\nIt was neither, I was buzzing but it wasn\'t what she thought\nPee in a teacup? 
Bitch, you ain\'t my keeper, I\'m sleeping\nWhat the fuck you keep on fucking with me for?\nSlut, you need to leave me the fuck alone, I ain\'t playin\'\nGo find you a white crayon and color a fucking zebra\n\n[Chorus]\nMy mom loved Valium and lots of drugs\nThat\'s why I am like I am \'cause I\'m like her\nBecause my mom loved Valium and lots of drugs\nThat\'s why I\'m on what I\'m on \'cause I\'m my mom\n\n[Verse 2]\nWait a minute, this ain\'t dinner, this is paint thinner\n"You ate it yesterday, I ain\'t hear no complaints, did I?\nNow here\'s a plate full of painkillers\nNow just wait \'til I crush the Valium and put it in your potatoes\nYou little motherfucker, I\'ll make you sit there and make\nThat retarded fucking face without even tasting it\nYou better lick the fucking plate, you ain\'t wasting it\nPut your face in it \'fore I throw you in the basement again\nAnd I ain\'t givin\' in, you\'re gonna just sit there\nIn one fucking place, finnickin\' \'til next Thanksgiving\nAnd if you still ain\'t finished it I\'ll use the same shit again\nThen when I make spinach dip it\'ll be placed in the shit\nYou little shit, wanna sit there and play innocent?\nA rack fell and hit me in K-Mart and they witnessed it\nChild support, your father, he ain\'t sent the shit\nAnd so what if he did? 
It\'s none of your dang business, kid"\n\n[Bridge 1]\nMy mom\nThere\'s no one else quite like my mom\nI know I should let bygones be bygones\nBut she\'s the reason why I am high what I\'m high on\n\n[Chorus]\n\'Cause my mom loved Valium and lots of drugs\nThat\'s why I am like I am \'cause I\'m like her\nBecause my mom loved Valium and lots of drugs\nThat\'s why I\'m on what I\'m on \'cause I\'m my mom\n\n[Bridge 2]\nMy mom loved Valium, now all I am\nIs a party animal, I am what I am\nBut I\'m strong to the finish with me Valium spinach\nBut my buzz only lasts about two minutes\nBut I don\'t wanna swallow it without chewin\' it\nI can\'t even write a rhyme without you in it\nMy Valium, my Vaaaaaa-liummmm, ohh\n\n[Verse 3]\nMan, I never thought that I could ever be\nA drug addict, naw, fuck that, I can\'t have it happen to me\nBut that\'s actually what has ended up happening\nA tragedy, the fucking past ended up catching me\nAnd it\'s probably where I got acquainted with the taste, ain\'t it?\nPharmaceuticals are the bomb, Mom, beautiful\nShe killed the fuckin\' dog with the medicine she done fed it\nFeed it a fuckin\' aspirin and say that it has a headache\n"Here, want a snack? 
You hungry, you fuckin\' brat?\nLook at that, it\'s a Xanax, take it and take a nap, eat it"\nBut I don\'t need it "Well fuck it then, break it up\nTake a little piece and beat it before you wake Nathan up"\nAlright Ma, you win, I don\'t feel like arguin\'\nI\'ll do it, pop and gobble it and start wobblin\'\nStumble, hobble, tumble, slip, trip, then I fall in bed\nWith a bottle of meds and a Heath Ledger bobblehead\n\n[Chorus]\nMy mom loved Valium and lots of drugs\nThat\'s why I am like I am \'cause I\'m like her\nBecause my mom loved Valium and lots of drugs\nThat\'s why I\'m on what I\'m on \'cause I\'m my mom\n\n[Outro]\nMy mom, I\'m just like her\nMy mom, my mom, my mom\nMy mom, my mom, my mom\nMy mom, my mom, my mom\nMy mom, my momma\nMe momma, I like-a da momma\nHaha, sorry Mom\nI still love you though\nDr. Dre, 2010\nAy, this shit is hella hard, homie\nYo, take us on outta here\nWoo!'

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
class LyricsExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls one column out of a DataFrame as an array.

    Parameters
    ----------
    key : str, default "lyrics"
        Name of the column to extract. The default reproduces the previous
        hard-coded behaviour.
    """

    def __init__(self, key="lyrics"):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; return the instance unchanged."""
        return self

    def transform(self, train):
        """Return the values of column ``self.key`` from ``train``."""
        return train[self.key].values

In [57]:
class LineCountExtractor(BaseEstimator, TransformerMixin):
    """Feature extractor: number of newline-separated lines in each song's lyrics."""

    def fit(self, x, y=None):
        """Nothing is learned; return the instance unchanged."""
        return self

    def transform(self, data):
        """Return, for every row of ``data``, how many lines its "lyrics" text splits into."""
        lines_per_song = data["lyrics"].str.split("\n")
        return lines_per_song.apply(len)

In [58]:
class AverageLineLengthExtractor(BaseEstimator, TransformerMixin):
    """Feature extractor: mean number of characters per line of each song's lyrics."""

    def fit(self, x, y=None):
        """Nothing is learned; return the instance unchanged."""
        return self

    def transform(self, data):
        """Return, for every row of ``data``, the average length of the lines in "lyrics"."""

        def mean_line_length(lines):
            total_chars = sum(len(line) for line in lines)
            return total_chars / len(lines)

        return data["lyrics"].str.split("\n").apply(mean_line_length)

In [111]:
class UniqueWordProportionExtractor(BaseEstimator, TransformerMixin):
    """Feature extractor: share of distinct words among all words of each song.

    Words are the whitespace-separated tokens of the lyrics. The previous
    pattern ``" |\\n|"`` ended in an empty alternative; since Python 3.7
    ``re.split`` also splits on empty matches, so that pattern cut the text
    between every character and the feature measured characters, not words.
    """

    def fit(self, x, y=None):
        """Nothing is learned; return the instance unchanged."""
        return self

    def transform(self, data):
        """Return, for every row of ``data``, the unique-word proportion of its "lyrics"."""
        return data["lyrics"].apply(self.prop_unique_words)

    def prop_unique_words(self, lyrics):
        """Return unique words / total words, or 0.0 when ``lyrics`` has no words."""
        # Tokenise once and reuse the result for both counts.
        words = self._split_words(lyrics)
        if not words:
            return 0.0
        return len(set(words)) / len(words)

    def get_total_word_count(self, lyrics):
        """Return the number of whitespace-separated words in ``lyrics``."""
        return len(self._split_words(lyrics))

    def get_unique_word_count(self, lyrics):
        """Return the number of distinct (case-sensitive) words in ``lyrics``."""
        return len(set(self._split_words(lyrics)))

    @staticmethod
    def _split_words(lyrics):
        """Split ``lyrics`` on runs of whitespace, discarding empty tokens."""
        return lyrics.split()

In [112]:
class ArrayCaster(BaseEstimator, TransformerMixin):
    """Pipeline step that turns a 1-D feature (e.g. a Series) into a column array.

    A length-n input becomes an (n, 1) array so it can be stacked next to the
    other feature blocks.
    """

    def fit(self, x, y=None):
        """Nothing is learned; return the instance unchanged."""
        return self

    def transform(self, data):
        """Return ``data`` as a transposed 2-D ``numpy.ndarray``.

        Same shape and values as the former ``np.transpose(np.matrix(data))``,
        but as a plain ndarray: NumPy discourages ``np.matrix`` and current
        scikit-learn input validation rejects it.
        """
        return np.atleast_2d(np.asarray(data)).T

In [None]:
from itertools import product

from sklearn.metrics import classification_report
# sklearn.grid_search was removed from scikit-learn; GridSearchCV lives in
# sklearn.model_selection (already used above for train_test_split).
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# One sub-pipeline per feature family. Each takes the raw DataFrame and
# produces a block of columns for the FeatureUnion below.
song_vect = Pipeline([
                    ("lyrics", LyricsExtractor()),
                    # "english" is the built-in stop list (same words as
                    # ENGLISH_STOP_WORDS) in the form accepted by all versions.
                    ("tfidf", TfidfVectorizer(stop_words="english"))
                ])

line_count = Pipeline([
                    ("get_line_count", LineCountExtractor()),
                    ("caster", ArrayCaster())
                ])

average_line_length = Pipeline([
                    ("get_average_line_length", AverageLineLengthExtractor()),
                    ("caster", ArrayCaster())
                ])

unique_word_proportion = Pipeline([
                    ("get_unique_word_proportion", UniqueWordProportionExtractor()),
                    ("caster", ArrayCaster())
                ])

featureunionvect = FeatureUnion([
    ("line_count", line_count),
    ("song_vect", song_vect),
    ("average_line_length", average_line_length),
    ("unique_word_proportion", unique_word_proportion)
])

classifier = LogisticRegression()
pipeline = Pipeline([('vect', featureunionvect), ('classifier', classifier)])

# Every non-empty on/off combination of the four feature families (15 in all),
# generated rather than hand-written: the hand-written list repeated the 0011
# combination three times in place of 1001/1010/1100. The keys must match the
# FeatureUnion step names exactly, so the TF-IDF block is "song_vect"; the
# earlier "song_vectors" key did not correspond to any transformer.
weighted_features = ["song_vect", "line_count", "average_line_length", "unique_word_proportion"]
transformer_weight_grid = [
    dict(zip(weighted_features, switches))
    for switches in product([0, 1], repeat=len(weighted_features))
    if any(switches)
]

# change these
parameters = {
    "vect__song_vect__tfidf__max_df": [0.7, 0.8, 1.0],
    "vect__song_vect__tfidf__min_df": [1, 5, 20],
    "vect__transformer_weights": transformer_weight_grid,
    "classifier__C": [1e5, 1e6, 1e7]
}
gs_clf = GridSearchCV(pipeline, parameters, cv=5)

# dataset
# NOTE: this rebinds train/test/X_*/y_* to the raw DataFrame split; cells that
# expect the TF-IDF lists built earlier must not be re-run after this one.
tracks_by_artist_copy = tracks_by_artist.copy()
train, test = train_test_split(tracks_by_artist_copy, test_size=0.3, random_state=42)
y_train, X_train = train["artist"], train
y_test, X_test = test["artist"], test

# classifier
gs_clf.fit(X_train, y_train)

#evaluation
# grid_scores_ was removed from scikit-learn; cv_results_ holds the same
# per-candidate information as parallel arrays.
cv_results = gs_clf.cv_results_
for mean_score, std_score, params in zip(cv_results["mean_test_score"],
                                         cv_results["std_test_score"],
                                         cv_results["params"]):
    print("gridsearch scores", "mean: %.5f, std: %.5f, params: %s" % (mean_score, std_score, params))

print("--------------------")
print("gridsearch best params", gs_clf.best_params_)

# Held-out evaluation, still disabled. It must go through the fitted search
# object (gs_clf); the bare `pipeline` is never fitted in this cell.
# y_pred = gs_clf.predict(X_test)

# from sklearn.metrics import accuracy_score, f1_score
# from sklearn.metrics import precision_recall_fscore_support

# print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
# print('Testing Precision: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[0]))
# print('Testing Recall: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[1]))
# print('Testing FScore: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[2]))
# print('Testing Support: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[3]))

In [None]:
# len(y_train) # 475
# len(X_train) # 475
# len(y_test) # 204
# len(X_test) # 204