In [None]:
# Prerequisite: nltk must be installed in the kernel's environment (e.g. run `%pip install nltk` once).


In [115]:
import numpy as np
import nltk  
#nltk.download("wordnet")  # only done once! we have to download the WordNet database locally

import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer   
#nltk.download('stopwords')

# very good tokenizer for english, considers sentence structure
from nltk.tokenize import TreebankWordTokenizer 
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB



In [116]:
# Load one lyrics table per band; the first CSV column is used as the row index.
roll_corpus, pink_corpus = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("./rolling_stones_corpus.csv", "./pink_floyd_corpus.csv")
)


In [117]:
# Preview the first rows of the Rolling Stones table (columns: song_lines, singer).
roll_corpus.head()

Unnamed: 0,song_lines,singer
0,If you start me up,rolling_stones
1,If you start me up I'll never stop,rolling_stones
2,You can start me up,rolling_stones
3,You can start me up I'll never stop,rolling_stones
4,I've been running hot,rolling_stones


In [118]:
# Preview the first rows of the Pink Floyd table (columns: song_lines, singer).
# NOTE(review): the recorded preview shows the same song lines as the Rolling
# Stones preview, only with a different `singer` value -- confirm that
# pink_floyd_corpus.csv really contains Pink Floyd lyrics.
pink_corpus.head()

Unnamed: 0,song_lines,singer
0,If you start me up,pink_floyd
1,If you start me up I'll never stop,pink_floyd
2,You can start me up,pink_floyd
3,You can start me up I'll never stop,pink_floyd
4,I've been running hot,pink_floyd


In [119]:
# Stack both bands' tables on top of each other and rename the columns to
# the names used by the rest of the notebook ("lyrics" and "group").
full_corpus = pd.concat([roll_corpus, pink_corpus]).rename(
    columns={"song_lines": "lyrics", "singer": "group"}
)
full_corpus.head()


Unnamed: 0,lyrics,group
0,If you start me up,rolling_stones
1,If you start me up I'll never stop,rolling_stones
2,You can start me up,rolling_stones
3,You can start me up I'll never stop,rolling_stones
4,I've been running hot,rolling_stones


In [120]:
# (rows, columns) of the combined table: one row per song line from either band.
full_corpus.shape

(3274, 2)

In [121]:
# Work on a copy so the combined table itself is left untouched.
corpus_all = full_corpus.copy()

corpus = []
LABELS = []

# Visit the bands in order of first appearance. For each band, collect its
# song lines into `corpus` and emit one matching band name per line into
# `LABELS`, so both lists stay aligned index by index.
for artist in corpus_all["group"].unique():
    song_lines = corpus_all.loc[corpus_all["group"] == artist, "lyrics"]
    corpus.extend(song_lines)
    LABELS.extend([artist] * len(song_lines))

In [122]:
# Display the collected song lines.
# NOTE(review): this echoes the entire list; a slice such as corpus[:10] would keep the output short.
corpus

['If you start me up',
 "If you start me up I'll never stop",
 'You can start me up',
 "You can start me up I'll never stop",
 "I've been running hot",
 'You got me ticking going to blow my top',
 'If you start me up',
 "If you start me up I'll never stop",
 'Never stop, never stop, never stop',
 'You make a grown man cry',
 'You make a grown man cry',
 'You make a grown man cry',
 'Spread out the oil, the gasoline',
 'I walk smooth, ride in a mean, mean machine',
 'Start it up',
 'If you start it up',
 'Kick on the starter',
 'Give it all you got, you got, you got',
 "I can't compete with the riders in the other heats",
 'If you rough it up',
 'If you like it, I can slide it up',
 'Slide it up, slide it up, slide it up',
 "Don't make a grown man cry",
 "Don't make a grown man cry",
 "Don't make a grown man cry",
 'My eyes dilate, my lips go green',
 'My hands are greasy',
 "She's a mean, mean machine",
 'Start it up',
 'Start me up',
 'Ah, give it all you got',
 'You got to never, nev

In [123]:
# Normalize case: every song line is converted to lower case.
CORPUS = [line.lower() for line in corpus]

In [124]:
# Tokenize and lemmatize every lower-cased song line.
# One-time setup if the lemmatizer's data is missing locally:
#   nltk.download("wordnet") and nltk.download("omw-1.4")
# (nltk itself is already imported in the imports cell at the top.)

tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

# Each line is split into Treebank tokens, every token is lemmatized
# (lemmatize() is called without a part-of-speech argument), and the lemmas
# are re-joined with single spaces so the vectorizer receives plain strings.
CLEAN_CORPUS = [
    " ".join(lemmatizer.lemmatize(token) for token in tokenizer.tokenize(text=doc))
    for doc in CORPUS
]

In [125]:
# Display the tokenized and lemmatized lines (e.g. "riders" -> "rider", "eyes" -> "eye").
# NOTE(review): this echoes the entire list; a slice such as CLEAN_CORPUS[:10] would keep the output short.
CLEAN_CORPUS

['if you start me up',
 "if you start me up i 'll never stop",
 'you can start me up',
 "you can start me up i 'll never stop",
 "i 've been running hot",
 'you got me ticking going to blow my top',
 'if you start me up',
 "if you start me up i 'll never stop",
 'never stop , never stop , never stop',
 'you make a grown man cry',
 'you make a grown man cry',
 'you make a grown man cry',
 'spread out the oil , the gasoline',
 'i walk smooth , ride in a mean , mean machine',
 'start it up',
 'if you start it up',
 'kick on the starter',
 'give it all you got , you got , you got',
 "i ca n't compete with the rider in the other heat",
 'if you rough it up',
 'if you like it , i can slide it up',
 'slide it up , slide it up , slide it up',
 "do n't make a grown man cry",
 "do n't make a grown man cry",
 "do n't make a grown man cry",
 'my eye dilate , my lip go green',
 'my hand are greasy',
 "she 's a mean , mean machine",
 'start it up',
 'start me up',
 'ah , give it all you got',
 'you 

In [126]:
# NLTK's English stop-word list; it is later passed to the TF-IDF vectorizer
# so that these words are ignored when building features.
STOPWORDS = stopwords.words('english')

print(STOPWORDS)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [127]:
# Instantiate a TF-IDF vectorizer that ignores the NLTK English stop words.
vectorizer = TfidfVectorizer(stop_words=STOPWORDS)

# Learn the vocabulary and IDF weights from the cleaned corpus and vectorize
# it in the same step (one row per song line).
vectors = vectorizer.fit_transform(CLEAN_CORPUS)

# Labelled dense view of the TF-IDF matrix, for visual inspection only:
# rows are indexed by band label, columns by vocabulary term.
pd.DataFrame(
    vectors.todense(),
    index=LABELS,
    columns=vectorizer.get_feature_names_out(),
)



Unnamed: 0,aaah,aching,across,admit,adrift,ah,ai,air,alive,alleyway,...,ya,yeah,year,yes,yesterday,york,young,youth,zipping,zombie
rolling_stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rolling_stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rolling_stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rolling_stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rolling_stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pink_floyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pink_floyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pink_floyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pink_floyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
# Dense view of the whole TF-IDF matrix (todense() yields a numpy matrix, as the output shows).
vectors.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [129]:
# First row of the dense matrix, i.e. the TF-IDF vector of the first song line.
vectors.todense()[0]

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [130]:
# Multinomial Naive Bayes classifier, created with its default settings (no arguments passed).
model = MultinomialNB()

In [131]:
# Train the classifier on the TF-IDF matrix; LABELS supplies one band name per row.

model.fit(vectors, LABELS)

In [132]:
# Display the label list that was passed to fit().
# NOTE(review): this echoes the entire list; a slice such as LABELS[:10] would keep the output short.
LABELS

['rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_stones',
 'rolling_st

In [133]:
# Accuracy on the very data the model was fitted on (there is no held-out
# split), so this measures fit to the training set, not generalization; the
# earlier remark that MultinomialNB tends to overfit cannot be judged from it.
# NOTE(review): the recorded score of 0.5 is chance level for two classes.
# Together with the identical head() previews of both CSVs this suggests the
# two corpora contain the same lines -- verify the Pink Floyd input file.

model.score(vectors, LABELS)

0.5

In [134]:
# Lines to classify with the trained model.
# NOTE(review): both strings also occur in the training corpus shown earlier
# ("You make a grown man cry", "My eyes dilate, my lips go green"), so this
# is not unseen data.

new_lyrics = ["You make a grown man cry", "My eyes dilate"]

In [135]:
# New text must go through exactly the same steps as the training corpus
# before it is vectorized: lower-casing, Treebank tokenization and WordNet
# lemmatization. If these steps are skipped, surface forms such as "eyes" are
# looked up in a vocabulary that was learned from lemmas ("eye"), so the word
# is silently dropped from the feature vector.
new_clean_lyrics = [
    " ".join(
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(text=lyric.lower())
    )
    for lyric in new_lyrics
]

# Use transform() only: the vocabulary and IDF weights learned from the
# training corpus are reused, never re-fitted on the new lines.
new_vectors = vectorizer.transform(new_clean_lyrics)

In [136]:
# Dense view of the TF-IDF vectors for the new lyrics (one row per new line).
new_vectors.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [137]:
# Predicted band label for each new line.
model.predict(new_vectors)

array(['pink_floyd', 'pink_floyd'], dtype='<U14')

In [138]:
# Per-class probabilities for each new line (one row per line, one column per class).
model.predict_proba(new_vectors)

array([[0.5, 0.5],
       [0.5, 0.5]])