In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import matplotlib.pyplot as plt
import multiprocessing
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
from pipelinehelper import PipelineHelper
import re
import seaborn as sns
from sklearn import utils
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import LinearSVC

%matplotlib inline

In [2]:
tracks_by_artist = pd.read_csv("./data/tracks_with_lyrics_for_top_10_artists.csv")
tracks_by_artist = tracks_by_artist[~tracks_by_artist["lyrics"].isnull()]

In [3]:
tracks_by_artist.head()

Unnamed: 0,artist,album,track,lyrics
0,T.I.,Trap Muzik,I Can't Quit,"[Intro]\nHuh, hell nah, I can't quit\nHell nah..."
1,T.I.,Trap Muzik,Be Easy,"[Intro]\nUh-uh, uh-uh, uh\nAye, where the pian..."
2,T.I.,Trap Muzik,No More Talk,[Verse 1]\nI'm either running for my life or I...
3,T.I.,Trap Muzik,Doin My Job,"[T.I. - talking]\nAy I'm working here, know wh..."
4,T.I.,Trap Muzik,24's,"[Intro]\nYeah\nFor all my real ATL niggas, tha..."


In [14]:
tracks_by_artist["lyrics"].values[0]

'[Intro]\nHuh, hell nah, I can\'t quit\nHell nah, man, we got too much money to be getting shot up\nStay down, Grand Hustle\n\n[Verse 1]\nLook, I\'m this far from being a star and just that close to quitting\nI never should\'ve came close to missing\nBut I want to be a musician, pimping, not a politician\nListen, feel my position, it\'s a rough transition\nPlus I\'m way too deep in the game to be trying to change\nFor fortune and fame and glamour, I can be in the slammer\nOr six feet up under Atlanta\nIt\'s a blessing to breathe\nI walk the streets with seven felonies, I\'m blessed to be free\nLong as somebody up in heaven who keep blessing a G\nT.I.P.\'ll still be blessing CD\'s\nSo haters you can see these\nCause I\'m back now with something to prove\nEverything to gain, pimping, nothing to lose\nI ought to start smacking niggas when it\'s nothing to do\nThey too big shoot them in the leg and even the odds\nCause you ain\'t hard\nUnless you ran with Cap and C-Rod\nTrapped with K.T. a

In [12]:
r = re.compile("\[(.*)\]")

tracks_by_artist["lyrics"].values[0]
# m = re.findall(r"\[(.*)\]", example)
list(filter(r.match, tracks_by_artist["lyrics"].values[0]))

['[Intro]\nHuh, hell nah, I can\'t quit\nHell nah, man, we got too much money to be getting shot up\nStay down, Grand Hustle\n\n[Verse 1]\nLook, I\'m this far from being a star and just that close to quitting\nI never should\'ve came close to missing\nBut I want to be a musician, pimping, not a politician\nListen, feel my position, it\'s a rough transition\nPlus I\'m way too deep in the game to be trying to change\nFor fortune and fame and glamour, I can be in the slammer\nOr six feet up under Atlanta\nIt\'s a blessing to breathe\nI walk the streets with seven felonies, I\'m blessed to be free\nLong as somebody up in heaven who keep blessing a G\nT.I.P.\'ll still be blessing CD\'s\nSo haters you can see these\nCause I\'m back now with something to prove\nEverything to gain, pimping, nothing to lose\nI ought to start smacking niggas when it\'s nothing to do\nThey too big shoot them in the leg and even the odds\nCause you ain\'t hard\nUnless you ran with Cap and C-Rod\nTrapped with K.T. 

In [70]:
# def get_song_structure(lyrics):

# m = re.findall(r"\[([\w]+)\]", example)

# m = re.findall(r"\[(.*)\]", tracks_by_artist["lyrics"].values[0:4])
# [word.split(" ")[0] for word in m]
# for track in m:
#     for word in track:
#         print(''.join([i for i in word if not i.isdigit()]))

m = [re.findall(r"\[(.*)\]", lyrics) for lyrics in tracks_by_artist["lyrics"].values[0:4]]
print([[''.join(c for c in word if not c.isdigit()) for word in track] for track in m])

[['Intro', 'Verse ', 'Hook', 'Verse ', 'Hook', 'Verse ', 'Hook'], ['Intro', 'Verse ', 'Hook', 'Verse ', 'T.I. talking'], ['Verse ', 'Hook', 'Verse ', 'Hook'], ['T.I. - talking', 'Verse ', 'Hook', 'Verse ', 'Hook', 'Verse ', 'Hook']]


In [79]:
tracks_by_artist["lyrics"].apply(lambda lyrics: " ".join(re.findall(r"\[(.*)\]", lyrics)))

0           Intro Verse 1 Hook Verse 2 Hook Verse 3 Hook
1                Intro Verse 1 Hook Verse 2 T.I. talking
2                              Verse 1 Hook Verse 2 Hook
3      T.I. - talking Verse 1 Hook Verse 2 Hook Verse...
4      Intro Hook Verse 1 Hook Verse 2 Hook Verse 3 Hook
5                 Hook Verse 1 Hook Verse 2 Hook Verse 3
6                   Ad lib Verse One Hook Verse Two Hook
7                 Hook Verse 1 Verse 2 Hook Verse 3 Hook
8                        Intro Verse 1 Hook Verse 2 Hook
9                                        Rap Verse: T.I.
10          Verse 1 Hook Verse 2 Hook Verse 3 Hook Outro
11          Intro Verse 1 Hook Verse 2 Hook Verse 3 Hook
12                                           Intro Verse
13                       Intro Verse 1 Hook Verse 2 Hook
14             Intro Verse 1 Hook Verse 2 Hook Hook Hook
15      Intro Hook Verse 1 Hook Verse 2 Hook Bridge Hook
16     Verse 1: T.I. Hook Verse 2: T.I. Hook Verse 3:...
17     T.I. praying over indist

In [51]:
word_tokenize("Intro")

['Intro']

In [None]:
# for hp in hyperparamters:

vectorizer = TfidfVectorizer(
    stop_words=ENGLISH_STOP_WORDS,
#     lowercase=False,
    max_df=0.7,
#     min_df=5,
#     ngram_range=(1,5),
#     tokenizer=tokenizer,
#     norm=None
)

tracks_by_artist_copy = tracks_by_artist.copy()
tracks_by_artist_copy["unnormalised_vectors"] = list(vectorizer.fit_transform(tracks_by_artist_copy["lyrics"].values).toarray())
tracks_by_artist_copy["normalised_vectors"] = tracks_by_artist_copy["unnormalised_vectors"].apply(normalise_vector)
tracks_by_artist_copy = tracks_by_artist_copy[tracks_by_artist_copy["unnormalised_vectors"].apply(sum) != 0]