# Doc2Vec with Logistic Regression

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import cosine
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import utils

%matplotlib inline

In [2]:
tracks_by_artist = pd.read_csv("./data/tracks_with_lyrics_for_top_10_artists.csv")

In [3]:
tracks_by_artist = tracks_by_artist[~tracks_by_artist["lyrics"].isnull()]

In [4]:
tracks_by_artist.shape

(679, 4)

In [5]:
# sample = tracks_by_artist[tracks_by_artist["artist"].isin(["2 Chainz","A$AP Rocky"])]

In [6]:
# preprocess lyrics ...
def preprocess(raw_lyrics):
    # Some choices here are specific to the format of Genius lyrics, want to remove non-vocalised 
    # text in square brackets, and other text not part of the main song body such as adlibs in 
    # round brackets
    try:
        raw_lyrics = re.sub("([\(\[].*?[\)\]])|([^\w\d'\s]+)", "", raw_lyrics)
        raw_lyrics = re.sub("[\n]+", " ", raw_lyrics)
        raw_lyrics = raw_lyrics.strip()
        raw_lyrics = raw_lyrics.lower()
    except TypeError:
        print(raw_lyrics)
        raw_lyrics = raw_lyrics.decode('utf-8')
        raw_lyrics = re.sub("([\(\[].*?[\)\]])|([^\w\d'\s]+)", "", raw_lyrics)
        raw_lyrics = re.sub("[\n]+", " ", raw_lyrics)
        raw_lyrics = raw_lyrics.strip()
        raw_lyrics = raw_lyrics.lower()

    return raw_lyrics

In [7]:
# train, test = train_test_split(tracks_by_artist, test_size=0.3, random_state=40)
data_x = tracks_by_artist[['lyrics']].as_matrix()
data_y = tracks_by_artist[['artist']].as_matrix()
stratified_split = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=34)
for train_index, test_index in stratified_split.split(data_x, data_y):
    x_train, x_test = data_x[train_index], data_x[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

X_train = [x[0] for x in x_train.tolist()]
X_test = [x[0] for x in x_test.tolist()]

In [8]:
# train_artists = train["artist"].tolist()
# train_lyrics = train["lyrics"].tolist()

# test_artists = test["artist"].tolist()
# test_lyrics = test["lyrics"].tolist()
train_lyrics = X_train
train_artists = y_train

test_artists = y_test
test_lyrics = X_test

In [9]:
train_tagged = [TaggedDocument(words=word_tokenize(preprocess(train_lyrics[idx])), tags=train_artists[idx].ravel()) for idx, _ in enumerate(train_artists)]
test_tagged = [TaggedDocument(words=word_tokenize(preprocess(test_lyrics[idx])), tags=test_artists[idx].ravel()) for idx, _ in enumerate(test_artists)]

In [10]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [11]:
# model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow = Doc2Vec(dm=0, vector_size=4000, sample=0, workers=cores)
model_dbow.build_vocab(train_tagged)

In [12]:
model_dbow.corpus_count

475

In [13]:
max_epochs = 30
for epoch in range(max_epochs):
    model_dbow.train(utils.shuffle(train_tagged), total_examples=model_dbow.corpus_count, epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha
#     previouse example
#     model_dbow.train(tagged_lyrics,
#                 total_examples=model.corpus_count,
#                 epochs=model.iter)
#     # decrease the learning rate
#     model.alpha -= 0.0002
#     # fix the learning rate, no decay
#     model.min_alpha = model.alpha

model_dbow.save("dbow.model")
print("Model Saved")

Model Saved


In [14]:
def vec_for_learning(model, tagged_docs):
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in tagged_docs])
    return targets, regressors

In [15]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [16]:
logreg = LogisticRegression(n_jobs=1, C=1e6)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.6666666666666666
Testing Precision: 0.6857108271045811
Testing Recall: 0.6666666666666666
Testing FScore: 0.663593579585521
Testing Support: None


In [None]:
# from sklearn.linear_model import LogisticRegressionCV

# clf = LogisticRegressionCV(cv=10, random_state=0, multi_class='multinomial')
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

# from sklearn.metrics import accuracy_score
# from sklearn.metrics import precision_recall_fscore_support


# print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
# print('Testing Precision: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[0]))
# print('Testing Recall: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[1]))
# print('Testing FScore: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[2]))
# print('Testing Support: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[3]))

In [17]:
from sklearn import svm

# svc = svm.SVC(gamma='scale', decision_function_shape='ovo')
linear_svc = svm.LinearSVC()
linear_svc.fit(X_train, y_train)
y_pred = linear_svc.predict(X_test)

prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.7205882352941176
Testing Precision: 0.7436345207139008
Testing Recall: 0.7205882352941176
Testing FScore: 0.7190501544747454
Testing Support: None


In [18]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Handle case where label not predicted in test case
prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.7009803921568627
Testing Precision: 0.7143949197612768
Testing Recall: 0.7009803921568627
Testing FScore: 0.6976864636038306
Testing Support: None


In [22]:
# model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=4000, window=10, sample=0, workers=cores, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab(train_tagged)

In [23]:
for epoch in range(30):
    model_dmm.train(utils.shuffle(train_tagged), total_examples=len(train_tagged), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

In [24]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support


print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[0]))
print('Testing Recall: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[1]))
print('Testing FScore: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[2]))
print('Testing Support: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[3]))

Testing accuracy 0.6029411764705882
Testing Precision: 0.6542737283630216
Testing Recall: 0.6029411764705882
Testing FScore: 0.6109437354774891
Testing Support: None


In [25]:
from sklearn import svm

# svc = svm.SVC(gamma='scale', decision_function_shape='ovo')
linear_svc = svm.LinearSVC()
linear_svc.fit(X_train, y_train)
y_pred = linear_svc.predict(X_test)

prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.6176470588235294
Testing Precision: 0.6627127077207617
Testing Recall: 0.6176470588235294
Testing FScore: 0.6243248084525774
Testing Support: None


In [26]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Handle case where label not predicted in test case
prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

Testing accuracy 0.35784313725490197
Testing Precision: 0.6326949272655403
Testing Recall: 0.35784313725490197
Testing FScore: 0.28617944764436354
Testing Support: None


In [None]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [None]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [None]:
def get_vectors(model, tagged_docs):
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in tagged_docs])
    return targets, regressors

In [None]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[0]))
print('Testing Recall: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[1]))
print('Testing FScore: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[2]))
print('Testing Support: {}'.format(precision_recall_fscore_support(y_test, y_pred, average='weighted')[3]))

In [None]:
from sklearn import svm

# svc = svm.SVC(gamma='scale', decision_function_shape='ovo')
linear_svc = svm.LinearSVC()
linear_svc.fit(X_train, y_train)
y_pred = linear_svc.predict(X_test)

prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))

In [None]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Handle case where label not predicted in test case
prfs = precision_recall_fscore_support(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing Precision: {}'.format(prfs[0]))
print('Testing Recall: {}'.format(prfs[1]))
print('Testing FScore: {}'.format(prfs[2]))
print('Testing Support: {}'.format(prfs[3]))