# Assignment 2. Text analysis

## Setup environment

In [1488]:
import string
from random import Random
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

rng = Random()
data = pd.read_csv("dataset.csv")

[nltk_data] Downloading package omw-1.4 to /home/xgodness/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/xgodness/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/xgodness/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/xgodness/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Training the model

### Quick glance on the data

In [1489]:
data.head()

Unnamed: 0,genre,lyrics,SongInfo
0,Christian,"Who am I, that the Lord of all the earth Woul...",CASTING CROWNS - WHO AM I LYRICS
1,Christian,Glory Revealed By His Wounds He was pierced ...,GLORY REVEALED - BY HIS WOUNDS LYRICS
2,Christian,Lord of heaven and earth Lord of all creation...,CAEDMON'S CALL - GOD OF WONDERS LYRICS
3,Christian,I can only imagine what it will be like When ...,MERCYME - I CAN ONLY IMAGINE LYRICS
4,Christian,I am not skilled to understand What God has w...,AARON SHUST - MY SAVIOR MY GOD LYRICS


In [1490]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558 entries, 0 to 557
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   genre     558 non-null    object
 1   lyrics    558 non-null    object
 2   SongInfo  558 non-null    object
dtypes: object(3)
memory usage: 13.2+ KB


In [1491]:
all_columns = data[['genre', 'lyrics']]
all_columns

Unnamed: 0,genre,lyrics
0,Christian,"Who am I, that the Lord of all the earth Woul..."
1,Christian,Glory Revealed By His Wounds He was pierced ...
2,Christian,Lord of heaven and earth Lord of all creation...
3,Christian,I can only imagine what it will be like When ...
4,Christian,I am not skilled to understand What God has w...
...,...,...
553,R&B,"Ha I dont care ha, about your past I just wan..."
554,R&B,Hoverin by my suitcase Tryin to find a warm ...
555,R&B,I dont know why I love you like I do After a...
556,R&B,"C. C. Rider Elvis Presley Well now see., C. ..."


In [1492]:
genres = all_columns['genre'].unique()
print(genres)

['Christian' 'Country' 'Hip-Hop' 'Pop' 'Rock' 'R&B']


In [1493]:
all_columns['genre'].value_counts()

Pop          100
Rock          95
Christian     94
Hip-Hop       91
R&B           91
Country       87
Name: genre, dtype: int64

### Picking genres

In [1494]:
rng.seed(1337)
first = rng.choice(genres)
rng.seed(7331)
second = rng.choice(genres)
print("{} and {}".format(first, second))

Rock and Hip-Hop


### Remove redundant rows

In [1495]:
columns = all_columns[(all_columns["genre"] == "Rock") | (all_columns["genre"] == "Hip-Hop")]

### Normalising data

In [1496]:
# normalised = columns["lyrics"].str.lower()
# ^^^ do not remove punctuation
normalised = columns["lyrics"].str.replace('[{}]'.format(string.punctuation), '', regex=True).str.lower()
# ^^^ remove punctuation

columns["normalised"] = normalised
columns["normalised"]

181     yet our best trained best educated best equip...
182     backstroke lover always hidin neath the cover...
183     intro  fuck all yall hoes  get a grip motherf...
184     one two three and to the fo  snoop doggy dogg...
185     you are now about to witness the strength of ...
                             ...                        
462     cant explain all the feelings that youre maki...
463     one foot on the brake and one on the gas hey ...
464     carry on my wayward son therell be peace when...
465     ooh yeah turn it up come on  im working hard ...
466     out on the road for forty days last night in ...
Name: normalised, Length: 186, dtype: object

### Tokenizing data

In [1497]:
columns["tokened"] = columns.apply(lambda row: nltk.word_tokenize(row['normalised']), axis=1)
columns["tokened"]

181    [yet, our, best, trained, best, educated, best...
182    [backstroke, lover, always, hidin, neath, the,...
183    [intro, fuck, all, yall, hoes, get, a, grip, m...
184    [one, two, three, and, to, the, fo, snoop, dog...
185    [you, are, now, about, to, witness, the, stren...
                             ...                        
462    [cant, explain, all, the, feelings, that, your...
463    [one, foot, on, the, brake, and, one, on, the,...
464    [carry, on, my, wayward, son, therell, be, pea...
465    [ooh, yeah, turn, it, up, come, on, im, workin...
466    [out, on, the, road, for, forty, days, last, n...
Name: tokened, Length: 186, dtype: object

### Removing stop-words

In [1498]:
noise = stopwords.words("english")
noiseless = columns["tokened"].apply(lambda x: [item for item in x if item not in noise])
noiseless_col = [", ".join(w) for w in noiseless]
columns["noiseless"] = noiseless_col
columns["noiseless"]

181    yet, best, trained, best, educated, best, equi...
182    backstroke, lover, always, hidin, neath, cover...
183    intro, fuck, yall, hoes, get, grip, motherfuck...
184    one, two, three, fo, snoop, doggy, dogg, dr, d...
185    witness, strength, street, knowledge, 1, ice, ...
                             ...                        
462    cant, explain, feelings, youre, making, feel, ...
463    one, foot, brake, one, gas, hey, well, theres,...
464    carry, wayward, son, therell, peace, done, lay...
465    ooh, yeah, turn, come, im, working, hard, your...
466    road, forty, days, last, night, little, rock, ...
Name: noiseless, Length: 186, dtype: object

### Lemmatisation

In [1499]:
lemmatiser = WordNetLemmatizer()
lemmatised = columns["noiseless"].apply(lambda x: [lemmatiser.lemmatize(x)])
lemmatised_col = [", ".join(w) for w in lemmatised]
columns["lemmatised"] = lemmatised_col
columns["lemmatised"]

181    yet, best, trained, best, educated, best, equi...
182    backstroke, lover, always, hidin, neath, cover...
183    intro, fuck, yall, hoes, get, grip, motherfuck...
184    one, two, three, fo, snoop, doggy, dogg, dr, d...
185    witness, strength, street, knowledge, 1, ice, ...
                             ...                        
462    cant, explain, feelings, youre, making, feel, ...
463    one, foot, brake, one, gas, hey, well, theres,...
464    carry, wayward, son, therell, peace, done, lay...
465    ooh, yeah, turn, come, im, working, hard, your...
466    road, forty, days, last, night, little, rock, ...
Name: lemmatised, Length: 186, dtype: object

### Result

In [1500]:
columns

Unnamed: 0,genre,lyrics,normalised,tokened,noiseless,lemmatised
181,Hip-Hop,"Yet our best trained, best educated Best equi...",yet our best trained best educated best equip...,"[yet, our, best, trained, best, educated, best...","yet, best, trained, best, educated, best, equi...","yet, best, trained, best, educated, best, equi..."
182,Hip-Hop,Backstroke lover always hidin neath the cover...,backstroke lover always hidin neath the cover...,"[backstroke, lover, always, hidin, neath, the,...","backstroke, lover, always, hidin, neath, cover...","backstroke, lover, always, hidin, neath, cover..."
183,Hip-Hop,[Intro] Fuck all yall hoes Get a grip mothe...,intro fuck all yall hoes get a grip motherf...,"[intro, fuck, all, yall, hoes, get, a, grip, m...","intro, fuck, yall, hoes, get, grip, motherfuck...","intro, fuck, yall, hoes, get, grip, motherfuck..."
184,Hip-Hop,"One, two, three and to the fo Snoop Doggy Do...",one two three and to the fo snoop doggy dogg...,"[one, two, three, and, to, the, fo, snoop, dog...","one, two, three, fo, snoop, doggy, dogg, dr, d...","one, two, three, fo, snoop, doggy, dogg, dr, d..."
185,Hip-Hop,You are now about to witness the strength of ...,you are now about to witness the strength of ...,"[you, are, now, about, to, witness, the, stren...","witness, strength, street, knowledge, 1, ice, ...","witness, strength, street, knowledge, 1, ice, ..."
...,...,...,...,...,...,...
462,Rock,Cant explain all the feelings that youre maki...,cant explain all the feelings that youre maki...,"[cant, explain, all, the, feelings, that, your...","cant, explain, feelings, youre, making, feel, ...","cant, explain, feelings, youre, making, feel, ..."
463,Rock,"One foot on the brake and one on the gas, hey...",one foot on the brake and one on the gas hey ...,"[one, foot, on, the, brake, and, one, on, the,...","one, foot, brake, one, gas, hey, well, theres,...","one, foot, brake, one, gas, hey, well, theres,..."
464,Rock,Carry on my wayward son Therell be peace when...,carry on my wayward son therell be peace when...,"[carry, on, my, wayward, son, therell, be, pea...","carry, wayward, son, therell, peace, done, lay...","carry, wayward, son, therell, peace, done, lay..."
465,Rock,"Ooh yeah Turn it up Come on Im working hard,...",ooh yeah turn it up come on im working hard ...,"[ooh, yeah, turn, it, up, come, on, im, workin...","ooh, yeah, turn, come, im, working, hard, your...","ooh, yeah, turn, come, im, working, hard, your..."


### Data segmentation and training

In [1501]:
x_train, x_test, y_train, y_test = train_test_split(columns.lemmatised, columns.genre, train_size = 0.7)
columns.genre.value_counts()

Rock       95
Hip-Hop    91
Name: genre, dtype: int64

### Vectorisation

In [1502]:
vectoriser = CountVectorizer(ngram_range=(1, 3))
vectorised_x_train = vectoriser.fit_transform(x_train)

### Classification

In [1503]:
clf = MultinomialNB()
clf.fit(vectorised_x_train, y_train)

In [1504]:
vectorised_x_test = vectoriser.transform(x_test)
clf.predict(vectorised_x_test)

array(['Hip-Hop', 'Rock', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop',
       'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop',
       'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Rock',
       'Hip-Hop', 'Hip-Hop', 'Rock', 'Rock', 'Hip-Hop', 'Hip-Hop', 'Rock',
       'Rock', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop',
       'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop',
       'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Rock',
       'Hip-Hop', 'Hip-Hop', 'Rock', 'Hip-Hop', 'Hip-Hop', 'Rock',
       'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Rock', 'Rock', 'Hip-Hop',
       'Hip-Hop'], dtype='<U7')

In [1505]:
pred = clf.predict(vectorised_x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

     Hip-Hop       0.60      1.00      0.75        27
        Rock       1.00      0.38      0.55        29

    accuracy                           0.68        56
   macro avg       0.80      0.69      0.65        56
weighted avg       0.81      0.68      0.65        56



## Predicting genre for two arbitrary songs

In [1506]:
data =  pd.read_csv("songs.csv")
data.head()

Unnamed: 0,genre,lyrics,SongInfo
0,Rock,Old yellow bricks Love's a risk Quite the litt...,Arctic Monkeys - Old Yellow Bricks Lyrics
1,Hip-Hop,"… May I have your attention, please? May I hav...",Eminem – The Real Slim Shady Lyrics


In [1507]:
columns = data[["genre", "lyrics"]]

# normalised = columns["lyrics"].str.lower()
# ^^^ do not remove punctuation
normalised = columns["lyrics"].str.replace('[{}]'.format(string.punctuation), '', regex=True).str.lower()
# ^^^ remove punctuation

columns["normalised"] = normalised

columns["tokened"] = columns.apply(lambda row: nltk.word_tokenize(row['normalised']), axis=1)

noiseless = columns["tokened"].apply(lambda x: [item for item in x if item not in noise])
noiseless_col = [", ".join(w) for w in noiseless]
columns["noiseless"] = noiseless_col

lemmatised = columns["noiseless"].apply(lambda x: [lemmatiser.lemmatize(x)])
lemmatised_col = [", ".join(w) for w in lemmatised]
columns["lemmatised"] = lemmatised_col

columns

Unnamed: 0,genre,lyrics,normalised,tokened,noiseless,lemmatised
0,Rock,Old yellow bricks Love's a risk Quite the litt...,old yellow bricks loves a risk quite the littl...,"[old, yellow, bricks, loves, a, risk, quite, t...","old, yellow, bricks, loves, risk, quite, littl...","old, yellow, bricks, loves, risk, quite, littl..."
1,Hip-Hop,"… May I have your attention, please? May I hav...",… may i have your attention please may i have ...,"[…, may, i, have, your, attention, please, may...","…, may, attention, please, may, attention, ple...","…, may, attention, please, may, attention, ple..."


In [1508]:
x_test = columns[["lemmatised"]].squeeze()
y_test = columns[["genre"]].squeeze()

In [1509]:
vectorised_x_test = vectoriser.transform(x_test)

In [1510]:
clf.predict(vectorised_x_test)

pred = clf.predict(vectorised_x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

     Hip-Hop       1.00      1.00      1.00         1
        Rock       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



## Distinguishing David Bowie from Paul McCartney

In [1511]:
data =  pd.read_csv("bowie-from-mccartney.csv")
data.head()

Unnamed: 0,cantorId,cantorNome,musicaNome,letra
0,0,david-bowie,Heroes,"I, I will be king. And you, you will be queen...."
1,0,david-bowie,Starman,"Didn't know what time it was,. The lights were..."
2,0,david-bowie,Space Oddity,Ground control to Major Tom. Ground control to...
3,0,david-bowie,Life On Mars?,It's a god-awful small affair. To the girl wit...
4,0,david-bowie,Modern Love,I know when to go out. And when to stay in. Ge...


In [1512]:
columns = data[["cantorNome", "letra"]]

columns = columns[(columns["cantorNome"] == "david-bowie") | (columns["cantorNome"] == "paul-mccartney")]

# normalised = columns["letra"].str.lower()
# ^^^ do not remove punctuation
normalised = columns["letra"].str.replace('[{}]'.format(string.punctuation), '', regex=True).str.lower()
# ^^^ remove punctuation

columns["normalised"] = normalised

columns["tokened"] = columns.apply(lambda row: nltk.word_tokenize(row['normalised']), axis=1)

noiseless = columns["tokened"].apply(lambda x: [item for item in x if item not in noise])
noiseless_col = [", ".join(w) for w in noiseless]
columns["noiseless"] = noiseless_col

lemmatised = columns["noiseless"].apply(lambda x: [lemmatiser.lemmatize(x)])
lemmatised_col = [", ".join(w) for w in lemmatised]
columns["lemmatised"] = lemmatised_col

columns

Unnamed: 0,cantorNome,letra,normalised,tokened,noiseless,lemmatised
0,david-bowie,"I, I will be king. And you, you will be queen....",i i will be king and you you will be queen tho...,"[i, i, will, be, king, and, you, you, will, be...","king, queen, though, nothing, drive, away, bea...","king, queen, though, nothing, drive, away, bea..."
1,david-bowie,"Didn't know what time it was,. The lights were...",didnt know what time it was the lights were lo...,"[didnt, know, what, time, it, was, the, lights...","didnt, know, time, lights, low, leaned, back, ...","didnt, know, time, lights, low, leaned, back, ..."
2,david-bowie,Ground control to Major Tom. Ground control to...,ground control to major tom ground control to ...,"[ground, control, to, major, tom, ground, cont...","ground, control, major, tom, ground, control, ...","ground, control, major, tom, ground, control, ..."
3,david-bowie,It's a god-awful small affair. To the girl wit...,its a godawful small affair to the girl with t...,"[its, a, godawful, small, affair, to, the, gir...","godawful, small, affair, girl, mousy, hair, mu...","godawful, small, affair, girl, mousy, hair, mu..."
4,david-bowie,I know when to go out. And when to stay in. Ge...,i know when to go out and when to stay in get ...,"[i, know, when, to, go, out, and, when, to, st...","know, go, stay, get, things, done, catch, pape...","know, go, stay, get, things, done, catch, pape..."
...,...,...,...,...,...,...
942,paul-mccartney,He's just a young boy looking for a way to fin...,hes just a young boy looking for a way to find...,"[hes, just, a, young, boy, looking, for, a, wa...","hes, young, boy, looking, way, find, love, isn...","hes, young, boy, looking, way, find, love, isn..."
943,paul-mccartney,How can I hope to reach your love. Help me to ...,how can i hope to reach your love help me to d...,"[how, can, i, hope, to, reach, your, love, hel...","hope, reach, love, help, discover, youre, thin...","hope, reach, love, help, discover, youre, thin..."
944,paul-mccartney,I like it. Please don't take my heart away. It...,i like it please dont take my heart away its h...,"[i, like, it, please, dont, take, my, heart, a...","like, please, dont, take, heart, away, happy, ...","like, please, dont, take, heart, away, happy, ..."
945,paul-mccartney,Yvonne is the one I´ve been counting on. She s...,yvonne is the one i´ve been counting on she sa...,"[yvonne, is, the, one, i´ve, been, counting, o...","yvonne, one, i´ve, counting, said, long, you´r...","yvonne, one, i´ve, counting, said, long, you´r..."


In [1513]:
x_train, x_test, y_train, y_test = train_test_split(columns.lemmatised, columns.cantorNome, train_size = 0.7)
columns.cantorNome.value_counts()

david-bowie       483
paul-mccartney    464
Name: cantorNome, dtype: int64

In [1514]:
vectorised_x_train = vectoriser.fit_transform(x_train)
clf.fit(vectorised_x_train, y_train)

In [1515]:
vectorised_x_test = vectoriser.transform(x_test)
clf.predict(vectorised_x_test)

array(['david-bowie', 'paul-mccartney', 'paul-mccartney',
       'paul-mccartney', 'paul-mccartney', 'david-bowie', 'david-bowie',
       'david-bowie', 'paul-mccartney', 'paul-mccartney',
       'paul-mccartney', 'david-bowie', 'paul-mccartney', 'david-bowie',
       'david-bowie', 'david-bowie', 'paul-mccartney', 'paul-mccartney',
       'paul-mccartney', 'paul-mccartney', 'paul-mccartney',
       'david-bowie', 'paul-mccartney', 'paul-mccartney', 'david-bowie',
       'paul-mccartney', 'david-bowie', 'david-bowie', 'paul-mccartney',
       'paul-mccartney', 'paul-mccartney', 'paul-mccartney',
       'david-bowie', 'david-bowie', 'david-bowie', 'david-bowie',
       'paul-mccartney', 'david-bowie', 'david-bowie', 'david-bowie',
       'david-bowie', 'paul-mccartney', 'david-bowie', 'paul-mccartney',
       'paul-mccartney', 'paul-mccartney', 'paul-mccartney',
       'paul-mccartney', 'david-bowie', 'david-bowie', 'david-bowie',
       'david-bowie', 'paul-mccartney', 'david-bowie', '

In [1516]:
pred = clf.predict(vectorised_x_test)
print(classification_report(y_test, pred))

                precision    recall  f1-score   support

   david-bowie       0.74      0.69      0.72       139
paul-mccartney       0.72      0.77      0.75       146

      accuracy                           0.73       285
     macro avg       0.73      0.73      0.73       285
  weighted avg       0.73      0.73      0.73       285

