Implementing tokenization and count vectorization from scratch 

Implementing TF-IDF from scratch 

Using dimensionality reduction on vectorized text data to create and interpret visualizations

In [1]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
import nltk
# Fetch NLTK's 'punkt' tokenizer data if it is missing (quiet=True silences
# the download log).
# NOTE(review): presumably this is the resource word_tokenize needs; recent
# NLTK releases may look for 'punkt_tab' instead -- confirm against the
# installed version.
nltk.download('punkt', quiet=True)
# Seed NumPy's global random number generator so repeated runs match.
np.random.seed(0)

In [8]:
# Song files are numbered 1 through 20; range() excludes its upper bound,
# hence the 21.
filenames = [f'song{number}.txt' for number in range(1, 21)]
filenames

['song1.txt',
 'song2.txt',
 'song3.txt',
 'song4.txt',
 'song5.txt',
 'song6.txt',
 'song7.txt',
 'song8.txt',
 'song9.txt',
 'song10.txt',
 'song11.txt',
 'song12.txt',
 'song13.txt',
 'song14.txt',
 'song15.txt',
 'song16.txt',
 'song17.txt',
 'song18.txt',
 'song19.txt',
 'song20.txt']

Using a list comprehension to build a list containing the filename of every song (`song1.txt` through `song20.txt`).

In [3]:
# Load a single song to experiment with. Iterating over the file handle
# yields one string per line, each keeping its trailing newline.
with open('data/song18.txt') as f:
    test_song = list(f)
test_song

['[Kendrick Lamar:]\n',
 "Two wrongs don't make us right away\n",
 "Tell me something's wrong\n",
 'Party all of our lives away\n',
 'To take you on\n',
 '[Zacari:]\n',
 'Oh, baby I want you\n',
 'Baby I need you\n',
 'I wanna see you\n',
 'Baby I wanna go out yeah\n',
 'Baby I wanna go out yeah\n',
 'Baby I want you\n',
 'Baby I need you\n',
 'I wanna see you\n',
 'Baby I wanna go out yeah\n',
 'Baby I wanna go out yeah\n',
 'All night (all night, all night)\n',
 'All night\n',
 "Your body's on fire\n",
 'And your drinks on ice\n',
 'All night (all night, all night)\n',
 'All night\n',
 "Your body's on fire\n",
 'And your drinks on ice\n',
 '[Babes Wodumo:]\n',
 'Oh my word oh my gosh oh my word (Oh my gosh)\n',
 'Oh my word oh my gosh oh my word (Oh my gosh)\n',
 'Oh my word oh my gosh oh my word (Oh my gosh)\n',
 'Oh my word oh my gosh oh my word (Oh my gosh)\n',
 'Everybody say kikiritikiki (kikiritikiki)\n',
 'Everybody say kikiritikiki (kikiritikiki)\n',
 'Everybody say kikiritik

### Tokenizing our data

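Cleaning the data: drop the bracketed artist-tag lines (e.g. `[Kendrick Lamar:]`), strip punctuation, and lowercase everything so the song becomes one plain string.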

In [9]:
def clean_song(song):
    """Turn a song's raw lines into one lowercase, punctuation-free string.

    Args:
        song: Iterable of line strings, e.g. the list produced by reading a
            lyrics file line by line.

    Returns:
        A single lowercase string. Any line containing a square bracket is
        dropped, the characters , . ' ? ! ( ) are removed, and newlines are
        turned into spaces. Lines that end in a newline therefore end up
        separated by two spaces (the join space plus the converted newline).
    """
    kept_lines = []
    for line in song:
        # A bracket anywhere on the line disqualifies the whole line.
        if "[" in line or "]" in line:
            continue
        kept_lines.append(line)

    text = " ".join(kept_lines)
    # Delete every punctuation character in a single pass.
    text = text.translate(str.maketrans("", "", ",.'?!()"))
    return text.replace("\n", " ").lower()
    
# Clean the sample song loaded earlier and print the resulting single string.
clean_test_song = clean_song(test_song)
print(clean_test_song)

two wrongs dont make us right away  tell me somethings wrong  party all of our lives away  to take you on  oh baby i want you  baby i need you  i wanna see you  baby i wanna go out yeah  baby i wanna go out yeah  baby i want you  baby i need you  i wanna see you  baby i wanna go out yeah  baby i wanna go out yeah  all night all night all night  all night  your bodys on fire  and your drinks on ice  all night all night all night  all night  your bodys on fire  and your drinks on ice  oh my word oh my gosh oh my word oh my gosh  oh my word oh my gosh oh my word oh my gosh  oh my word oh my gosh oh my word oh my gosh  oh my word oh my gosh oh my word oh my gosh  everybody say kikiritikiki kikiritikiki  everybody say kikiritikiki kikiritikiki  everybody say kikiritikiki kikiritikiki  everybody say kikiritikiki kikiritikiki  ungbambe ungdedele ungbhasobhe unggudluke  ungbambe ungdedele ungbhasobhe unggudluke  ungbambe ungdedele ungbhasobhe unggudluke  ungbambe ungdedele ungbhasobhe unggudlu

In [10]:
# Split the cleaned lyrics into word tokens with NLTK. Note in the output
# that the tokenizer breaks "wanna" into the two tokens "wan" and "na".
tokenized_test_song = word_tokenize(clean_test_song)
tokenized_test_song

['two',
 'wrongs',
 'dont',
 'make',
 'us',
 'right',
 'away',
 'tell',
 'me',
 'somethings',
 'wrong',
 'party',
 'all',
 'of',
 'our',
 'lives',
 'away',
 'to',
 'take',
 'you',
 'on',
 'oh',
 'baby',
 'i',
 'want',
 'you',
 'baby',
 'i',
 'need',
 'you',
 'i',
 'wan',
 'na',
 'see',
 'you',
 'baby',
 'i',
 'wan',
 'na',
 'go',
 'out',
 'yeah',
 'baby',
 'i',
 'wan',
 'na',
 'go',
 'out',
 'yeah',
 'baby',
 'i',
 'want',
 'you',
 'baby',
 'i',
 'need',
 'you',
 'i',
 'wan',
 'na',
 'see',
 'you',
 'baby',
 'i',
 'wan',
 'na',
 'go',
 'out',
 'yeah',
 'baby',
 'i',
 'wan',
 'na',
 'go',
 'out',
 'yeah',
 'all',
 'night',
 'all',
 'night',
 'all',
 'night',
 'all',
 'night',
 'your',
 'bodys',
 'on',
 'fire',
 'and',
 'your',
 'drinks',
 'on',
 'ice',
 'all',
 'night',
 'all',
 'night',
 'all',
 'night',
 'all',
 'night',
 'your',
 'bodys',
 'on',
 'fire',
 'and',
 'your',
 'drinks',
 'on',
 'ice',
 'oh',
 'my',
 'word',
 'oh',
 'my',
 'gosh',
 'oh',
 'my',
 'word',
 'oh',
 'my',
 'gos

### Vectorization

In [11]:
def count_vectorize(tokenized_song):
    """Count how many times each token occurs in a tokenized song.

    The input is traversed exactly once, so one-shot iterators (generators,
    ``iter(...)``) are counted correctly rather than being exhausted by a
    separate vocabulary-building pass.

    Args:
        tokenized_song: Iterable of hashable tokens, e.g. the list of words
            produced by the tokenizer.

    Returns:
        dict mapping each distinct token to its number of occurrences. Keys
        appear in first-seen order, so the printed result is the same on
        every run. An empty input yields an empty dict.
    """
    song_dict = {}
    for word in tokenized_song:
        # dict.get supplies 0 the first time a word is seen, so there is no
        # need to pre-populate the dictionary from a set of unique words.
        song_dict[word] = song_dict.get(word, 0) + 1
    return song_dict

# Count-vectorize the tokenized sample song and print the word -> count mapping.
test_vectorized = count_vectorize(tokenized_test_song)
print(test_vectorized)

{'unggudluke': 4, 'word': 8, 'everybody': 4, 'me': 1, 'need': 6, 'us': 1, 'lives': 1, 'ice': 6, 'oh': 17, 'kikiritikiki': 8, 'all': 25, 'tell': 1, 'right': 1, 'go': 13, 'want': 6, 'and': 6, 'bodys': 6, 'ungbambe': 4, 'take': 1, 'fire': 6, 'gosh': 8, 'party': 1, 'baby': 24, 'dont': 1, 'wrongs': 1, 'of': 1, 'i': 30, 'your': 12, 'see': 6, 'wan': 18, 'ungdedele': 4, 'to': 1, 'night': 24, 'up': 16, 'you': 19, 'out': 12, 'two': 1, 'wrong': 1, 'high': 16, 'ungbhasobhe': 4, 'say': 4, 'make': 1, 'my': 16, 'yeah': 12, 'away': 2, 'somethings': 1, 'na': 18, 'our': 1, 'we': 1, 'on': 13, 'drinks': 6}
