In [1]:
#Import all necessary packages
import numpy as np
import pandas as pd 
import re
from sklearn.feature_extraction import _stop_words as stop_words 
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>


# Week 2.2 Word Importance and Topic Modelling

### TF/IDF

Up until this point, we've seen how counting words, and looking at the most frequent can gives us some insight into a single document. If we want to start comparing documents with more certainty, or getting smarter about our representations, we can try and get a set of numbers for each documents that not only represents **word frequency**, but also **word importance**. 

What we will end up with is a measurement called **TF/IDF** or **T**erm **F**requency x **I**nverse **D**ocument **F**requency. 

### TF

**TF** stands for **term frequency** and we've been using it a lot already in our Bags of Words. By itself, it tells us how many times a particular term appears in a document. Can we do better?


In [2]:
import re
fs = open('data/hacking.txt', 'r') 
book = fs.read()

In [74]:
#Called once for each document
#Every time you update this function you need to run the cell again (Shift + Enter)
def my_tokeniser(doc):
    #Split on spaces
    tokens = re.split(r'[-\s.,;!?]+', doc)
    return [lem.lemmatize(t.lower()) for t in tokens if not t in stop_words.ENGLISH_STOP_WORDS]

In [75]:
#Using the CountVectorizer to get a bag of words using a custom tokeniser
count_vectoriser = CountVectorizer(tokenizer=my_tokeniser)
bag_of_words = count_vectoriser.fit_transform([book])
print(bag_of_words.todense().shape)

(1, 11770)


In [76]:
vocab = count_vectoriser.get_feature_names_out()
bag_of_words_df = pd.DataFrame(bag_of_words.todense(), columns = vocab)

In [77]:
#Topic 20 most common words (we've already removed STOP WORDS)
bag_of_words_df.iloc[0].sort_values()[-20:]

machine      249
people       277
phone        289
network      297
worm         298
hacking      301
day          303
mendax       307
police       323
anthrax      338
phoenix      361
just         361
like         367
didn't       378
electron     383
time         439
par          501
hacker       708
'            823
computer    1020
Name: 0, dtype: int64

### TF Seems OK?

Consider our book and some of its most common words

- computer 
- hacking
- security 
- police
- network

### Normalised Term Frequency

These words seem to represent key topics of the book quite well. However, what about **mother**? This appears 113  times across the book, out of a vocabulary of  approx. 13,000 words. Compare this to a WhatsApp conversation that me and my sister had about our family Christmas that has the word **mother** 5 times in with a vocabulary of about 50 words. When we compare just **term frequency**, it seems like the hacking book is far, far more (~20 times) about mothers than this text message chain. But thats not really the case. 

We use **normalised term frequency** to account for this, where the length of the document is used alongside the count to adjust for this.

In [78]:
bag_of_words_df["mother"][0]

113

In [79]:
#Divide term frequency by total number of unique words (vocab size)
book_tf = bag_of_words_df["mother"][0] / len(vocab)
text_msg_tf = 5.0 / 50
#Much bigger normalised term frequency for text msgs
print(book_tf, text_msg_tf)

0.009600679694137639 0.1


### IDF

**IDF** stands for **I**nverse **D**ocument **F**requency and it tells us how important a word is in a particular document in comparison to the rest of the corpus. Up until this point we've been considering the book as one big document, but now we're going to take each chapter on its own, to see if we can see if we can highlight differences between them.

We can see below that most chapters have the terms **computer** and **hacker** featuring pretty heavily. 

The **IDF** is the ratio of all documents in comparison to how many documents the term appears in. 

It tells us how surprising is it that this word appeared here, given what we know about all the documents. 






## Getting the Vector for each document (chapter)

First, we use a **regex** to split it into chapters, as there is a recognisable formatting to this. This means our corpus is the whole novel, with each chapter considered a new document and we store the whole thing as a 1D array. Each item in the array is a string containing a chapter's worth of text.

### Examining the highest TF values

Looking at the Term Frequency (bag of words) for each chapter shows that each chapter has quite similar high frequency words like ``computer`` and ``hacker``. This isn't particularly useful if we want a representation that highlights the important terms **to that chapter**.

In [82]:
chapters = re.split(r'\s\s\s\s\s\sChapter+', book)

In [83]:
count_vectoriser = CountVectorizer(tokenizer=my_tokeniser)
bag_of_words = count_vectoriser.fit_transform(chapters)
print(bag_of_words.todense().shape)

(11, 11770)


In [84]:
bag_of_words_df = pd.DataFrame(bag_of_words.todense(), columns = vocab)
for i in range(len(bag_of_words_df)):
    print("chapter", i)
    print(bag_of_words_df.iloc[i].sort_values(ascending = False).head(10))

chapter 0
book           38
underground    25
hacker         23
computer       21
mountain       19
black          18
par            18
like           13
suelette       13
new            12
Name: 0, dtype: int64
chapter 1
computer    288
worm        261
nasa        114
account     102
hacker       99
span         95
network      86
people       82
time         74
like         70
Name: 1, dtype: int64
chapter 2
par         180
network      65
hacker       64
force        61
computer     55
alto         45
like         41
theorem      40
time         39
didn't       38
Name: 2, dtype: int64
chapter 3
par         261
theorem      48
'            45
didn't       37
agent        37
room         36
secret       36
computer     36
hacker       33
service      33
Name: 3, dtype: int64
chapter 4
'           226
phoenix     196
electron    178
computer    140
hacker       88
machine      88
zardoz       74
deszip       61
just         56
file         56
Name: 4, dtype: int64
chapter 5
electron  

### Examining the highest TF/IDF values

Now we want to see which words are important to each chapter. Interestingly we've lost all of the words like `computer` and `hacking`, because they're surprising or indicative of that chapter, given the whole corpus. These words are the words that tell us the most about each chapter.

It seems likes names (of people and of viruses?) are important distinctions between chapters. 

We also did lemmatisation instead of stemming and often have the same word, and its possesive version in a chapter (`anthrax` and `anthrax's`). Maybe stemming would be better?

In [86]:
#Using the TFIDF Vectorizer to get TFIDF vectors with custom tokeniser
tfidf_vectoriser = TfidfVectorizer(tokenizer=my_tokeniser)
tfidf = tfidf_vectoriser.fit_transform(chapters)
print(tfidf.todense().shape)

(11, 11770)


In [87]:
tfidf_df = pd.DataFrame(tfidf.todense(), columns = vocab)
for i in range(len(tfidf_df)):
    print("chapter", i)
    print(tfidf_df.iloc[i].sort_values(ascending = False).head(10))

chapter 0
book             0.259165
mountain         0.250175
suelette         0.194637
julian           0.175161
par              0.158727
underground      0.156855
dreyfus          0.149721
`underground'    0.149721
assange          0.149721
hacker           0.144307
Name: 0, dtype: float64
chapter 1
worm        0.455574
computer    0.357677
span        0.247602
nasa        0.198986
mcmahon     0.149069
account     0.126677
w**k        0.126167
bowen       0.125104
hacker      0.122951
network     0.106806
Name: 1, dtype: float64
chapter 2
par         0.630041
theorem     0.168667
force       0.165135
network     0.161879
hacker      0.159388
alto        0.144310
computer    0.136974
citibank    0.135452
defcon      0.124801
machine     0.102871
Name: 2, dtype: float64
chapter 3
par         0.772681
theorem     0.171189
kentucky    0.129372
motel       0.128195
'           0.094788
agent       0.092146
par's       0.087527
nibbler     0.085450
room        0.082428
didn't      0.07793

### Maths With Word Vectors
So what we have now is a **vector** for each document (in our case, each document is a chapter). This vector represents something about the text in that chapter based on the frequencies that words occur, and how that relates to the corpus as a whole. 

We can use these vectors calculate how similar two documents by calculating the distance between them. Our vectors are currently >10,000. This means this is the _dimensionality_ of our vector. We can actually use similar maths that we would use to work out the distance between 2 points in 2 dimensional space. And its much easier to visualise how that works!

Two methods often used are **Manhattan distance** and **Euclidean** distance, but we tend to use something else for TF/IDF vectors.

### Cosine Distance

What we actually want to use is something called the **cosine distance**, which essentially tells how much the two vectors are pointing in the same direction. The results go from -1 to 1, where 1 is exactly the same, 0 is nothing in common and -1 is **anti-similar**. However, this never happens for TFIDF vectors, because word counts can never be negative!

### Similarity  

What we already see is that we can begin to group documents together by how similar they are. Later in the class we  will teach you some more advanced methods for taking this idea further. 

Interestingly, the first chapter seems to be the most different from the rest, and I think that isn't a Chapter per se, but the preface. Also, consecutive chapters tend to be the most simliar to each other. 

In [88]:
#Import the cosine similarity method from sklearn
from sklearn.metrics.pairwise import cosine_similarity as cosine
result = cosine(tfidf_df)
#Put the result in a dataframe and 
df = pd.DataFrame(result)
#Show with heatmap style gradients
df.style.background_gradient(cmap='Greens')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.256828,0.295226,0.289855,0.18923,0.183745,0.209863,0.176466,0.16495,0.147333,0.302989
1,0.256828,1.0,0.308646,0.195964,0.368159,0.302507,0.328194,0.382442,0.260568,0.238328,0.334303
2,0.295226,0.308646,1.0,0.739294,0.38292,0.316235,0.275956,0.274283,0.21575,0.206762,0.268272
3,0.289855,0.195964,0.739294,1.0,0.282055,0.243258,0.221948,0.161455,0.17828,0.138218,0.20255
4,0.18923,0.368159,0.38292,0.282055,1.0,0.720724,0.412985,0.302526,0.261026,0.216051,0.352627
5,0.183745,0.302507,0.316235,0.243258,0.720724,1.0,0.445504,0.237414,0.314378,0.203667,0.355287
6,0.209863,0.328194,0.275956,0.221948,0.412985,0.445504,1.0,0.264994,0.346665,0.178332,0.342815
7,0.176466,0.382442,0.274283,0.161455,0.302526,0.237414,0.264994,1.0,0.635501,0.225833,0.283895
8,0.16495,0.260568,0.21575,0.17828,0.261026,0.314378,0.346665,0.635501,1.0,0.160936,0.345479
9,0.147333,0.238328,0.206762,0.138218,0.216051,0.203667,0.178332,0.225833,0.160936,1.0,0.580587


## Using TF/IDF to do a search

Here we have a collection of **104** Tom Waits lyrics (https://www.kaggle.com/datasets/albertsuarez/azlyrics)

We're going to show how you can use **TF/IDF** similarity to search for songs within it! As each TFIDF vector tells us something about the tokens present, and their importance to each song in relation to the wider catalogue, this will be a better search than simply matching exact strings.

### A Taylor Swift Aside 

Here we have a collection of **478** Taylor Swift lyrics (https://www.kaggle.com/datasets/deepshah16/song-lyrics-dataset

Firstly, there is no way there are 478 Taylor Swift songs, most be some duplication. 

Also, although its given a song lyric dataset, whats actually in there?

In [4]:
taylor_swift = pd.read_csv("data/TaylorSwift.csv")
taylor_swift = taylor_swift[taylor_swift.notna()]

In [5]:
songs = taylor_swift[["Title","Lyric"]]

In [6]:
#Most are songs
songs.iloc[0]["Lyric"]

"vintage tee brand new phone high heels on cobblestones when you are young they assume you know\u2005nothing sequin\u2005smile black lipstick sensual\u2005politics when you are young they assume\u2005you know nothing   but i knew you dancin' in your levi's drunk under a streetlight i i knew you hand under my sweatshirt baby kiss it better i  refrain and when i felt like i was an old cardigan under someone's bed you put me on and said i was your favorite   a friend to all is a friend to none chase two girls lose the one when you are young they assume you know nothing   but i knew you playing hideandseek and giving me your weekends i i knew you your heartbeat on the high line once in twenty lifetimes i  refrain and when i felt like i was an old cardigan under someone's bed you put me on and said i was your favorite   to kiss in cars and downtown bars was all we needed you drew stars around my scars but now i'm bleedin'   'cause i knew you steppin' on the last train marked me like a blood

In [7]:
#Some are speeches
songs.iloc[439]["Lyric"]

'im writing this post about the upcoming midterm elections on november 6th in which ill be voting in the state of tennessee in the past ive been reluctant to publicly voice my political opinions but due to several events in my life and in the world in the past two years i feel very differently about that now i always have and always will cast my vote based on which candidate will protect and fight for the human rights i believe we all deserve in this country i believe in the fight for lgbtq rights and that any form of discrimination based on sexual orientation or gender is wrong i believe that the systemic racism we still see in this country towards people of color is terrifying sickening and prevalent  i cannot vote for someone who will not be willing to fight for dignity for all americans no matter their skin color gender or who they love running for senate in the state of tennessee is a woman named marsha blackburn as much as i have in the past and would like to continue voting for 

In [8]:
#Some are not in english
songs.iloc[475]["Lyric"]

'zwrotka  siedzę i patrzę jak czytasz z głową pochyloną budzę się i patrzę jakl oddychasz z zamkniętymi oczyma siedzę i oglądam ciebie zauważam wszystko co robisz i czego nie robisz jesteś tyle starszy i mądrzejszy i ja  refren  czekam przy drzwiach jak małe dziecko używam najlepsze farby na twój portret nakrywam stół wykwintnymi pierdołami i patrzę jak ty to jedynie znosisz jeśli to wszystko dzieje się w mojej głowie to powiedz mi teraz powiedz mi że mylę się jakoś wiem że moja miłość powinna być celebrowana a ty ją jedynie znosisz  zwrotka  witam cię jak zwycięzcę powracającego z pola walki pobłażliwie traktuję twoje niedyskrecje siedzę i słucham poleruję talеrze aż lśnią i błyszczą jesteś tylе starszy i mądrzejszy i ja  refren  czekam przy drzwiach jak małe dziecko używam najlepsze farby na twój portret nakrywam stół wykwintnymi pierdołami i patrzę jak ty to jedynie znosisz jeśli to wszystko dzieje się w mojej głowie to powiedz mi teraz powiedz mi że mylę się jakoś wiem że moja miło

In [9]:
#Some are lists of tour dates
songs.iloc[354]["Lyric"]

'north america leg one5808  glendale az 508  santa clara ca 508  santa clara ca 5808  pasadena ca 5908  pasadena ca 508  seattle wa 5508  denver co 608  chicago il 608  chicago il europe6808  manchester england 6908  manchester england 6508  dublin ireland 6608  dublin ireland 608  london england 608  london england north america leg two6008  louisville ky 7708  columbus oh 7008  landover md 708  landover md 708  philadelphia pa 7408  philadelphia pa 7708  cleveland oh 7008  east rutherford nj 708  east rutherford nj 708  east rutherford nj 7608  foxborough ma 7708  foxborough ma 7808  foxborough ma 808  toronto canada 8408  toronto canada 8708  pittsburgh pa 8008  atlanta ga 808  atlanta ga 8408  tampa fl 8808  miami gardens fl 8508  nashville tn 8808  detroit mi 808  minneapolis mn 908  minneapolis mn 9808  kansas city mi 9508  indianapolis in 9808  st louis mi 908  new orleans la 9908  houston tx 0508  arlington tx 0608  arlington tx oceania0908  perth australia 0608  melbourne aust

In [11]:
#Load in Tom Waits
tom_waits = pd.read_csv("data/tom_waits.tsv", delimiter="\t")
songs = tom_waits[["SONG_NAME","LYRICS"]]
songs.columns = ["Title","Lyric"]

In [12]:
songs.sample(10)

Unnamed: 0,Title,Lyric
49,that feel,"well there's one thing you can't lose, it's th..."
94,kiss me,"the fire's dying out, all the embers have been..."
9,little trip to heaven (on the wings of your love),"lazy trip to heaven on the wings of your love,..."
62,big in japan,"i got the style but not the grace, i got the c..."
91,what keeps mankind alive,"you gentlemen who think you have a mission, to..."
54,nobody,"nobody, nobody, will ever love you the way i c..."
14,kentuchy avenue,eddie graces buick got 4 bullet holes in the s...
36,frank's song,"that woman will take you, that woman will brea..."
17,heartattack and vine,"liar liar with your pants on fire, white spade..."
80,lost in the harbour,"over here the ladies all want sweet perfume, b..."


### The Search

1. Get TFIDF vectors for all your documents (songs)


2. Use **the same** process to get a TFIDF vector for your query 


3. Calculate cosine similarity 


4. Return the nearest match(es)


So now we can find songs that have **similar words** which are important in **similar ways**

In [13]:
#Use custom tokeniser
tfidf_vectoriser = TfidfVectorizer(tokenizer=my_tokeniser)

NameError: name 'my_tokeniser' is not defined

In [14]:
#Get TFIDF
tfidf = tfidf_vectoriser.fit_transform(songs["Lyric"])
#Save list of unique tokens (vocab) for later
vocab = tfidf_vectoriser.get_feature_names()
print(tfidf.todense().shape)

NameError: name 'tfidf_vectoriser' is not defined

### Most important terms for each song

Interesting to see which songs the most important tokens are also the title. Not always the case!

In [101]:
tfidf_df = pd.DataFrame(tfidf.todense(), columns = vocab)
for i in range(len(tfidf_df)):
    print(songs.iloc[i]["Title"])
    print(tfidf_df.iloc[i].sort_values(ascending = False).head(10))

ol' 55
truck      0.335463
freeway    0.292749
car        0.268395
feeling    0.223642
riding     0.215443
sun's      0.197834
went       0.194062
luck       0.185340
lady       0.185340
coming     0.161037
Name: 0, dtype: float64
i hope that i don't fall in love with you
hope      0.406561
fall      0.310512
love      0.303618
don't     0.267786
look      0.226836
turn      0.186307
chair     0.171597
just      0.133893
think     0.127423
you're    0.108946
Name: 1, dtype: float64
virginia avenue
catching    0.547334
walking     0.273667
tell        0.256617
i'm         0.194285
avenue      0.182445
let         0.176979
closing     0.170923
dreaming    0.143162
got         0.124384
they're     0.123717
Name: 2, dtype: float64
old shoes (& picture postcards)
kiss        0.324804
dear        0.275953
farewell    0.275953
anymore     0.275953
bind        0.275953
gone        0.246205
call        0.245008
goodbye     0.245008
eye         0.237111
i'll        0.235342
Name: 3, dtype: float

### Conducting a query

In [105]:
#Get the TFIDF vector for your query
query = tfidf_vectoriser.transform(["a drunk man beer wine whiskey"])

In [106]:
#Get similarity between query and song catalogue
similarity = cosine(query.todense(), tfidf.todense())[0]



In [107]:
#Get the closest 5 songs
closest = np.argsort(similarity, )[-5:]
songs.iloc[closest].values

array([["i hope that i don't fall in love with you",
        "well i hope that i don't fall in love with you, 'cause falling in love just makes me blue, well the music plays and you display your heart for me to see, i had a beer and now i hear you calling out for me, and i hope that i don't fall in love with you. well the room is crowded, people everywhere, and i wonder, should i offer you a chair?, well if you sit down with this old clown, take that frown and break it, before the evening's gone away, i think that we could make it, and i hope that i don't fall in love with you. well the night does funny things inside a man, these old tom-cat feelings you don't understand, well i turn around to look at you, you light a cigarette, i wish i had the guts to bum one, but we've never met, and i hope that i don't fall in love with you. i can see that you are lonesome just like me, and it being late, you'd like some some company, well i turn around to look at you, and you look back at me, the 

### LSA

Texts the say the same things, but with different words, will have completely different TFIDF vectors. We can do better!

Topic Modelling approaches attempt to improve this can grouping things together based on similar semantic meaning, not just frequency of terms. 

SVD will group together terms that occur frequently together in the same documents

The first thing we need to do is subtract the mean of each tfidf column from each value (sometimes called "whitening")

In [29]:
from sklearn.decomposition import TruncatedSVD

In [16]:
lyrics = pd.read_csv("data/lyric_data.tsv", delimiter="\t")
songs = lyrics[["ARTIST_NAME-SONG_NAME","LYRICS"]]
songs.columns = ["Title","Lyric"]

In [109]:
lyrics.columns

Index(['ARTIST_NAME-SONG_NAME', 'ARTIST_URL', 'SONG_NAME', 'SONG_URL',
       'LYRICS'],
      dtype='object')

In [111]:
#Get TFIDF
tfidf = tfidf_vectoriser.fit_transform(songs["Lyric"])
#Save list of unique tokens (vocab) for later
vocab = tfidf_vectoriser.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf.todense(), columns = vocab)
print(tfidf.todense().shape)

(1636, 12319)


In [112]:
#Subtract mean
tfidf_df = tfidf_df - tfidf_df.mean()

In [114]:
#How many topics?
num_topics = 16
pd.options.display.max_columns=num_topics
labels = ['topic{}'.format(i) for i in range(num_topics)]

In [115]:
#Calculate topics
svd = TruncatedSVD(n_components = num_topics, n_iter = 100) 
svd_topic_vectors = svd.fit_transform(tfidf_df.values)

Now we have 16 values for each song (instead of 1000s!)

But there's more!

We can look at the weights LSA has assigned to each word within each topic. `svd.components_` is a variable that gives us these weightings:

In [117]:
#How much does each topic apply to each token?
topic_weights = pd.DataFrame(svd.components_.T, index=vocab, columns=labels)
topic_weights.sample(20) #display it

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
wailing,-0.001008,-0.000168,0.000279,-0.0009846614,1.7e-05,-0.000893,0.000861,-0.000575,0.000353,-0.00017,0.000469,-0.00098,0.000424,-6.2e-05,0.00028,0.000573
savior,3.7e-05,-9.8e-05,-0.000617,-0.000523475,-2.7e-05,0.000926,-0.000692,-0.001461,-0.000917,-0.000691,-0.000992,-0.000301,0.00134,-0.000456,0.000528,0.000446
split,0.000526,0.007128,-0.003817,0.0008617967,0.001127,-0.004547,0.002331,0.00014,-0.004852,-0.001048,-0.002901,-0.004172,-0.002121,0.009345,-0.005379,0.00074
media's,0.000344,0.001593,0.000117,0.0005331844,0.001602,0.000252,0.000342,0.001136,0.002887,0.000629,-0.000152,-0.000334,0.00019,0.000465,0.000677,-0.000409
carbon,-0.000308,7.2e-05,-0.00124,0.001182744,0.000618,-0.000115,0.000162,-0.000779,0.00049,-0.000365,-0.001258,0.00018,0.00108,7e-05,-0.000781,0.000929
flower's,-0.002208,0.002256,0.001929,-0.00197136,-0.000399,-0.000486,-0.000207,0.001419,0.00112,0.000195,0.000186,0.000882,-0.001624,-0.00059,1.9e-05,-0.003147
v,-0.000515,-0.000431,-0.000808,-0.0008686411,-0.000636,0.001153,-0.001213,0.00061,8.4e-05,-0.001338,-0.001595,-0.001015,-7.1e-05,-0.00032,-0.001224,-0.00107
platform,-0.000419,0.000295,-0.001237,0.000414029,-0.000129,-0.0008,0.000605,0.000127,0.000859,-0.000359,-6.3e-05,0.000374,0.000531,0.000332,-0.001552,0.000827
chemistry,-8e-05,-0.000506,-7e-05,-0.001105099,-0.000395,0.000536,0.00013,4.5e-05,0.000175,-0.000415,-0.00068,-0.000987,-0.001227,0.001584,-0.000146,-0.001033
grey,-0.006863,0.006509,-0.002223,-0.001534188,-0.006659,-0.010944,-0.000254,-0.003208,-0.00422,-0.001393,-0.003155,0.008333,-0.0051,0.00166,-0.003012,0.003724


In [118]:
#How much does each topic apply to each song?
svd_topic_vectors_df = pd.DataFrame(svd_topic_vectors, index=songs["Title"].values, columns=labels)
svd_topic_vectors_df.sample(10)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
britney spears-plastic,0.009118,-0.071823,-0.084237,-0.044189,-0.078905,0.061279,0.102349,-0.012192,0.064009,-0.0639,-0.04226,0.017785,0.059598,-0.008154,-0.005247,0.080116
"waits, tom-blue valentines",-0.005124,0.0066,-0.03351,0.006746,0.064683,-0.062322,0.006534,-0.026172,-0.019424,0.031939,-0.046856,-0.048682,-0.011899,0.043372,-0.011097,-0.012831
elton john-i don't care,0.094584,0.01763,-0.114931,-0.077113,0.006914,0.211456,-0.041294,-0.232806,0.118562,-0.095481,-0.247927,-0.030415,0.08451,-0.01443,-0.128116,0.151656
beach boys-the little old lady from pasadena,-0.092713,-0.000648,-0.004298,-0.000978,0.063456,0.019759,-0.051334,-0.055528,-0.057713,-0.054845,0.044815,-0.047933,0.047284,0.015405,0.02419,-0.04597
"waits, tom-she stole the blush",-0.059362,0.016893,-0.003358,-0.027765,-0.044169,-0.00788,0.012088,0.034496,0.006408,-0.011582,-0.038955,-0.03195,-0.044013,0.048858,0.004998,-0.029453
aerosmith-love in an elevator,0.087333,0.131247,0.034533,0.030594,0.138934,0.073553,0.013081,0.060507,0.13239,-0.011199,-0.026513,-0.035667,0.007902,0.018312,0.020383,-0.009398
aerosmith-deuces are wild,0.152323,0.122794,0.025325,-0.035696,0.118066,0.053187,0.024164,-0.002985,-0.039618,-0.068937,0.065999,-0.079658,0.032014,0.127859,-0.08687,0.022457
depeche mode-a pain that i'm used to,0.094769,0.012326,-0.205065,0.088134,-0.00894,-0.023979,0.02243,-0.013775,0.012572,0.117372,0.012976,-0.055514,-0.046325,0.010678,0.10823,-0.004679
eddie vedder-goodbye,0.054795,0.108312,-0.09828,0.037851,-0.055692,-0.054634,-0.016587,-0.024897,-0.008595,0.01889,-0.024555,0.062336,-0.037709,0.043814,-0.013997,-0.011322
spice girls-love thing,0.359084,0.06098,-0.20373,0.013284,-0.032231,0.198995,-0.060752,0.02515,0.086975,0.006633,-0.044807,-0.056513,-0.076145,0.015504,0.048947,0.008072


In [119]:
#Most relevant words for each topic
num_terms = 20
for i in range(num_topics):
    print("___topic " + str(i) + "___")
    topicName = "topic" + str(i)
    weightedlist = topic_weights.get(topicName).sort_values()[-num_terms:]
    print(weightedlist.index.values)

___topic 0___
["can't" "i'll" 'feel' 'make' "it's" 'wanna' 'say' 'need' 'tell' 'let'
 'just' "you're" 'want' 'yeah' "i'm" 'oh' 'know' "don't" 'baby' 'love']
___topic 1___
['moon' 'hope' 'away' 'say' 'star' 'summer' 'real' 'tell' 'true' 'life'
 'gone' 'world' "i've" 'eye' 'time' 'heart' 'fall' 'day' "it's" 'love']
___topic 2___
['mm' 'thank' 'bye' 'mmm' "darlin'" 'ooo' 'ho' 'ha' 'ah' 'sweet' 'la'
 'ooh' 'whoa' 'oo' 'girl' 'hey' 'baby' 'yeah' 'love' 'oh']
___topic 3___
['tell' 'long' 'hard' 'like' 'home' 'standing' 'good' 'heart' 'think'
 'alright' 'gone' "i've" "she's" 'time' 'gonna' "it's" 'yeah' 'know' "i'm"
 'oh']
___topic 4___
['old' 'ready' 'pretty' 'honey' "that's" "you're" 'bad' 'blue' 'little'
 'hey' "she's" "ain't" 'love' 'good' 'yeah' 'got' 'gonna' 'girl' 'baby'
 "i'm"]
___topic 5___
['woman' 'round' 'really' 'talk' 'know' "ain't" 'dance' 'gotta' 'gonna'
 'love' "she's" 'ah' 'boy' 'girl' 'got' 'la' 'wanna' 'yeah' 'hey' "don't"]
___topic 6___
["we're" 'world' 'win' 'home' 'come

Makes me think that "yeah" should be a stop word for song lyrics?

In [120]:
#Query some strings and see how they relate to topics
df = topic_weights.T["ooh whoa christmas".split()]
df.style.background_gradient(cmap='Greens')

Unnamed: 0,ooh,whoa,christmas
topic0,0.062398,0.016103,-0.028641
topic1,-0.081349,0.01296,0.021802
topic2,0.051036,0.052496,0.011664
topic3,-0.014966,0.023576,-0.005431
topic4,-0.013499,-0.02198,-0.009351
topic5,-0.017662,0.013119,-0.047153
topic6,0.009704,0.004545,-0.010108
topic7,-0.031302,0.015441,0.041362
topic8,0.036416,0.00831,-0.008629
topic9,0.039404,0.008226,-0.023128


## LDiA

In [41]:
from sklearn.decomposition import LatentDirichletAllocation

In [121]:
#We calculate LDA on the Bag Of Words, NOT TFIDF
count_vectoriser = CountVectorizer(tokenizer=my_tokeniser)
bag_of_words = count_vectoriser.fit_transform(songs["Lyric"])
vocab = count_vectoriser.get_feature_names_out()

print(bag_of_words.todense().shape)

(1636, 12319)


In [132]:
lda = LatentDirichletAllocation(n_components=num_topics,
                                random_state=123,
                                learning_method='batch')

In [133]:
#May take some time depending on size of dataset!
lda_topics = lda.fit_transform(bag_of_words)

In [134]:
#How much does each topic apply to each song?
lda_topic_vectors_df = pd.DataFrame(lda_topics, index=songs["Title"].values, columns=labels)
lda_topic_vectors_df.sample(10)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
"cohen, leonard-teachers",0.000492,0.000492,0.000492,0.000492,0.000492,0.000492,0.000492,0.62386,0.000492,0.36925,0.000492,0.000492,0.000492,0.000492,0.000492,0.000492
"mitchell, joni-woodstock",0.000702,0.000702,0.000702,0.000702,0.000702,0.000702,0.000702,0.924087,0.000702,0.000702,0.000702,0.000702,0.066081,0.000702,0.000702,0.000702
elton john-dirty little girl,0.000772,0.000772,0.000772,0.204296,0.051012,0.000772,0.503489,0.000772,0.000772,0.000772,0.000772,0.000772,0.000772,0.000772,0.000772,0.231944
aerosmith-no more no more,0.000619,0.000619,0.000619,0.000619,0.000619,0.199438,0.262609,0.000619,0.100388,0.000619,0.045572,0.000619,0.337433,0.000619,0.000619,0.048372
beach boys-no-go showboat,0.000687,0.000687,0.000687,0.000687,0.000687,0.000687,0.000687,0.000687,0.818213,0.000687,0.000687,0.172171,0.000687,0.000687,0.000687,0.000687
kaiser chiefs-misery company,0.000326,0.000326,0.000326,0.000326,0.000326,0.000326,0.000326,0.000326,0.000326,0.000326,0.000326,0.000326,0.995117,0.000326,0.000326,0.000326
"beatles, the-medley: kansas city / hey, hey, hey, hey",0.367886,0.03696,0.000573,0.453877,0.134396,0.000573,0.000573,0.000573,0.000573,0.000573,0.000573,0.000573,0.000573,0.000573,0.000573,0.000573
bob dylan-do right to me baby (do unto others),0.000391,0.000391,0.000391,0.356762,0.000391,0.000391,0.035222,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391,0.000391,0.602938
dusty springfield-here she comes,0.12517,0.000772,0.000772,0.000772,0.000772,0.000772,0.000772,0.000772,0.08373,0.000772,0.000772,0.000772,0.000772,0.000772,0.030139,0.751701
elton john-hey armadillo,0.000613,0.000613,0.000613,0.000613,0.110814,0.000613,0.000613,0.000613,0.680158,0.000613,0.000613,0.000613,0.000613,0.000613,0.000613,0.201063


### Comparing distributions of SVD and LDA

How are the distributions of topic scores different for SVD and LSA? Compare the results for the same song

In [135]:
song_name = 'portishead-mysterons'

In [136]:
svd_topic_vectors_df.loc[song_name]

topic0    -0.033149
topic1    -0.086843
topic2    -0.019210
topic3    -0.144541
topic4    -0.010099
topic5     0.040664
topic6    -0.185396
topic7     0.258922
topic8    -0.105410
topic9    -0.066661
topic10   -0.092530
topic11    0.001251
topic12    0.058855
topic13   -0.166626
topic14    0.093101
topic15   -0.017662
Name: portishead-mysterons, dtype: float64

In [137]:
lda_topic_vectors_df.loc[song_name] 

topic0     0.001389
topic1     0.001389
topic2     0.001389
topic3     0.001389
topic4     0.001389
topic5     0.001389
topic6     0.001389
topic7     0.001389
topic8     0.001389
topic9     0.001389
topic10    0.001389
topic11    0.001389
topic12    0.001389
topic13    0.001389
topic14    0.001389
topic15    0.979167
Name: portishead-mysterons, dtype: float64

In [138]:
#Highest scoring songs for each topic
for topic in labels:
    print(topic)
    #Get last songs 10 in list (highest scoring for each topic)
    print(songs["Title"].values[lda_topic_vectors_df[topic].argsort().values[-10:]])

topic0
['elton john-queen of cities (el dorado ii)' 'britney spears-better'
 'dusty springfield-mockingbird' 'beach boys-help me, rhonda'
 'beatles, the-i am the walrus' 'waits, tom-diamonds on my windshield'
 'waits, tom-murder in the red barn' 'britney spears-liar'
 'waits, tom-hell broke luce'
 'britney spears-till the world ends (alex suarez club remix)']
topic1
['depeche mode-lilian' 'elton john-take me to the pilot'
 'manic street preachers-dead martyrs'
 'kaiser chiefs-dead or in serious trouble' 'bob dylan-paths of victory'
 'aerosmith-cheese cake' 'beach boys-hushabye' "beatles, the-searchin'"
 'aerosmith-lightning strikes' 'beach boys-pitter patter']
topic2
['manic street preachers-tsunami'
 'manic street preachers-another invented disease'
 'elton john-my quicksand' 'waits, tom-in the colosseum'
 'elton john-the new fever waltz' 'dusty springfield-heartbeat'
 'bruce springsteen-breakaway' 'beach boys-palisades park'
 'basement jaxx-get me off' 'de la soul-property of spitkic

In [129]:
#Most relevant tokens for each topic
for i, topic in enumerate(lda.components_):
    print("topic " + str(i) + ":")
    #Get last n tokens (highest values)
    print(vocab[topic.argsort()[-num_terms:]])

topic 0:
['just' 'said' "don't" 'wo' 'rhonda' "she's" 'way' 'dorado' 'el' "it's"
 'come' 'gonna' 'let' 'bye' 'home' "i'm" 'like' 'help' 'know' 'oh']
topic 1:
['fall' 'road' 'look' 'like' 'cuckoo' "i'm" 'gonna' 'dead' 'fun' 'just'
 'pitter' "what's" 'got' 'star' 'patter' 'light' 'oh' 'yeah' 'ooo' 'ah']
topic 2:
['waiting' 'come' "we've" 'night' 'cold' 'mind' "won't" 'got' 'yeah'
 'like' 'went' "i'm" 'just' 'body' 'need' 'oh' 'baby' 'kiss' 'heart'
 "it's"]
topic 3:
['like' 'woman' 'little' 'shake' 'bad' 'street' "it's" 'blue' 'right'
 'gonna' 'gotta' 'love' "you're" "ain't" 'come' 'good' "i'm" 'baby' 'got'
 'yeah']
topic 4:
['stop' 'bring' 'love' 'talk' 'better' 'day' 'know' "can't" 'boy' 'god'
 'time' 'little' 'come' 'yeah' 'oh' 'say' "don't" 'let' "i'm" 'hey']
topic 5:
['like' "it's" 'wind' 'tell' 'eye' 'look' 'lonely' "you'll" 'come' 'live'
 'time' 'day' 'run' 'world' 'life' 'just' "you're" 'away' 'know' "don't"]
topic 6:
["we're" 'thing' 'love' 'lost' 'number' 'just' 'play' 'time' "t