In [1]:
#pip install nltk


In [2]:
import numpy as np
import nltk  
#nltk.download("wordnet")  # only done once! we have to download the WordNet database locally

import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer   
#nltk.download('stopwords')

# very good tokenizer for english, considers sentence structure
from nltk.tokenize import TreebankWordTokenizer 
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB



In [3]:
roll_corpus = pd.read_csv("./rolling_stones_corpus.csv", index_col=0)
pink_corpus = pd.read_csv("./pink_floyd_corpus.csv", index_col=0)


In [4]:
roll_corpus.head()

Unnamed: 0,song_lines,singer
0,If you start me up,rolling_stones
1,If you start me up I'll never stop,rolling_stones
2,You can start me up,rolling_stones
3,You can start me up I'll never stop,rolling_stones
4,I've been running hot,rolling_stones


In [5]:
pink_corpus.head()

Unnamed: 0,song_lines,singer
0,"Leave when I ask you to leave, Lucy",pink_floyd
1,"Please fall away from me, Lucy",pink_floyd
2,"Oh, go, little girl",pink_floyd
3,"See that I'm so broken up about you, Lucy",pink_floyd
4,"Mean treatin' me and done me harm, Lucy",pink_floyd


In [6]:
full_corpus = pd.concat([pink_corpus, roll_corpus])
full_corpus.rename(columns={"song_lines": "lyrics", "singer": "group"}, inplace=True)
full_corpus.head()


Unnamed: 0,lyrics,group
0,"Leave when I ask you to leave, Lucy",pink_floyd
1,"Please fall away from me, Lucy",pink_floyd
2,"Oh, go, little girl",pink_floyd
3,"See that I'm so broken up about you, Lucy",pink_floyd
4,"Mean treatin' me and done me harm, Lucy",pink_floyd


In [7]:
full_corpus.shape

(2018, 2)

In [8]:
corpus_all = full_corpus.copy()

corpus = []
LABELS = []

# Create list of song lines and labels
for artist in corpus_all["group"].unique():

    # For current artist, get all song lines
    song_lines = corpus_all[corpus_all["group"] == artist]["lyrics"]
    
    # Add each line to a list (corpus)
    for line in song_lines:
        corpus.append(line)

    # For each lyric, add the artist to another list
    for i in range(len(song_lines)):
        LABELS.append(artist)

In [9]:
corpus

['Leave when I ask you to leave, Lucy',
 'Please fall away from me, Lucy',
 'Oh, go, little girl',
 "See that I'm so broken up about you, Lucy",
 "Mean treatin' me and done me harm, Lucy",
 'Been in love with you and your charms, Lucy',
 'Oh, go, little girl',
 "I'm in love with you, Lucy",
 'You got my heart, you got my heart, oh no',
 "You tear me apart, you just won't let me go",
 "You hold on so tight, so tight I just can't breathe",
 'Now Lucy leave, Lucy',
 'Leave when I ask you to leave, little girl',
 'Please fall away from me, little girl',
 'Yeah, go, little girl',
 "See that I'm so broke up about you, Lucy",
 'Yeah, girl',
 'Bo Diddley was a private eye',
 'Hand was fast and his I.Q. high',
 'D.A. said "Double O, you won\'t last',
 'Slow down boy, you living too fast"',
 'Well, Double O had a Cadillac',
 'Machine guns and a bullet-proof back',
 'Engine taken from a 707',
 'Sounds like a sure way to get into heaven',
 'In his pocket, 5 pounds of lead',
 'To keep Bo Diddley fr

In [10]:
# let's lower case everything
CORPUS = [s.lower() for s in corpus]

In [11]:
import nltk
#nltk.download('omw-1.4')
#nltk.download('corpora/omw-1.4')
#tokenize and lemmatize

CLEAN_CORPUS = []

tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()


for doc in CORPUS:
    tokens = tokenizer.tokenize(text=doc)
    clean_doc = " ".join(lemmatizer.lemmatize(token) for token in tokens)
    CLEAN_CORPUS.append(clean_doc)

In [12]:
CLEAN_CORPUS

['leave when i ask you to leave , lucy',
 'please fall away from me , lucy',
 'oh , go , little girl',
 "see that i 'm so broken up about you , lucy",
 "mean treatin ' me and done me harm , lucy",
 'been in love with you and your charm , lucy',
 'oh , go , little girl',
 "i 'm in love with you , lucy",
 'you got my heart , you got my heart , oh no',
 "you tear me apart , you just wo n't let me go",
 "you hold on so tight , so tight i just ca n't breathe",
 'now lucy leave , lucy',
 'leave when i ask you to leave , little girl',
 'please fall away from me , little girl',
 'yeah , go , little girl',
 "see that i 'm so broke up about you , lucy",
 'yeah , girl',
 'bo diddley wa a private eye',
 'hand wa fast and his i.q. high',
 "d.a. said `` double o , you wo n't last",
 "slow down boy , you living too fast ''",
 'well , double o had a cadillac',
 'machine gun and a bullet-proof back',
 'engine taken from a 707',
 'sound like a sure way to get into heaven',
 'in his pocket , 5 pound of l

In [13]:
STOPWORDS = stopwords.words('english')

print(STOPWORDS)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
# instanciation
vectorizer = TfidfVectorizer(stop_words=STOPWORDS)

# fit bag of words model on our corpus
vectors = vectorizer.fit_transform(CLEAN_CORPUS)


# for us to see the vectorized labeled data
pd.DataFrame(vectors.todense(), columns=vectorizer.get_feature_names_out(), index=LABELS) # check result of vectorization



Unnamed: 0,45,707,aaaaa,aaaaah,aaah,aching,across,admit,adrift,afternoon,...,ya,yeah,year,yes,yesterday,york,young,youth,zipping,zombie
pink_floyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pink_floyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pink_floyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pink_floyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pink_floyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rolling_stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rolling_stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rolling_stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rolling_stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
vectors.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
vectors.todense()[0]

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [17]:
model = MultinomialNB()

In [18]:
# fit the model with the vectorized data

model.fit(vectors, LABELS)  # model.fit

In [19]:
LABELS

['pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_floyd',
 'pink_flo

In [20]:
# multinomialNB usually tends to overfit

model.score(vectors, LABELS)

0.9326065411298315

In [26]:
# TEST DATA OR NEW Data

new_lyrics = ["paint it black", "another brick in the wall"]

In [27]:
# we have to apply the same steps

# I skipped over the preprocessing 

# used only transform 

new_vectors = vectorizer.transform(new_lyrics)

In [28]:
new_vectors.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
model.predict(new_vectors)

array(['rolling_stones', 'pink_floyd'], dtype='<U14')

In [30]:
model.predict_proba(new_vectors)

array([[0.1665232 , 0.8334768 ],
       [0.60558316, 0.39441684]])