In [1]:
from ast import literal_eval
from collections import Counter
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

In [2]:
# Load the per-track lyrics table; the path is relative to the notebook's
# working directory.
data = pd.read_csv("./data/complete_tracks_with_lyrics.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,artist,album,track,lyrics,word_list
0,BROCKHAMPTON,iridescence,WEIGHT,[Verse 1: Kevin Abstact]\nThey split my world ...,"['they', 'split', 'my', 'world', 'into', 'piec..."
1,BROCKHAMPTON,iridescence,VIVID,"[Intro: Matt Champion]\n""Yo, get—[censored]—tu...","['yo', 'getcensoredturn', 'that', 'shit', 'ove..."
2,BROCKHAMPTON,iridescence,TAPE,"[Verse 1: Kevin Abstract]\nI can barely rap, I...","['i', 'can', 'barely', 'rap', 'i', 'can', 'bar..."
3,BROCKHAMPTON,Saturation III,STAINS,[Verse 1: Ameer Vann]\nI spent like a year and...,"['i', 'spent', 'like', 'a', 'year', 'and', 'a'..."
4,BROCKHAMPTON,iridescence,DISTRICT,"[Intro]\n""I'm Sammy Jo, and my favorite colors...","[""i'm"", 'sammy', 'jo', 'and', 'my', 'favorite'..."


In [4]:
# Pool every track's words into one list per artist.
#
# Result shape: {artist: {"artist": artist, "word_list": [word, ...]}}.
# A single groupby pass replaces the earlier "loop over unique artists, then
# filter the whole frame for each one" approach. sort=False keeps artists in
# order of first appearance, and rows with a missing artist are skipped
# (groupby's default), which the old equality filter never matched either.
word_list_by_artist = {}

for artist, artist_tracks in data.groupby("artist", sort=False):
    pooled_words = []
    for raw_word_list in artist_tracks["word_list"]:
        # Each cell is parsed with literal_eval and is expected to yield a
        # list of words for that track.
        pooled_words.extend(literal_eval(raw_word_list))

    word_list_by_artist[artist] = {
        "artist": artist,
        "word_list": pooled_words,
    }

In [5]:
# Build one summary row per artist: token counts after stopword removal,
# vocabulary richness (unique / total), and the ten most frequent words.
top_word_columns = ["1st_word", "2nd_word", "3rd_word", "4th_word", "5th_word", "6th_word", "7th_word", "8th_word", "9th_word", "10th_word"]
columns = ["artist", "normalised_word_count", "normalised_unique_word_count"] + top_word_columns

# Read the stopword list once and keep it as a set: calling
# stopwords.words('english') inside the comprehension re-evaluated it for
# every single token and searched it as a list.
english_stopwords = set(stopwords.words('english'))

stats_rows = []

for key, value in word_list_by_artist.items():

    filtered_words = [word for word in value["word_list"] if word not in english_stopwords]

    normalised_word_count = len(filtered_words)
    normalised_unique_word_count = len(set(filtered_words))
    # Guard the ratio so an artist left with no words yields NaN instead of
    # raising ZeroDivisionError.
    vocab_richness = (
        normalised_unique_word_count / normalised_word_count
        if normalised_word_count
        else float("nan")
    )

    # most_common may return fewer entries than there are rank columns; pad
    # with None so short vocabularies no longer raise IndexError.
    top_words = [word for word, _count in Counter(filtered_words).most_common(len(top_word_columns))]
    top_words.extend([None] * (len(top_word_columns) - len(top_words)))

    new_row = {
        "artist": key,
        "normalised_word_count": normalised_word_count,
        "normalised_unique_word_count": normalised_unique_word_count,
        "vocab_richness": vocab_richness,
    }
    new_row.update(zip(top_word_columns, top_words))
    stats_rows.append(new_row)

# Create the frame once from the collected records. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
# "vocab_richness" goes last, which is where the row-wise append placed it.
stats_df = pd.DataFrame(stats_rows, columns=columns + ["vocab_richness"])

In [6]:
# Order artists by their stopword-filtered word count, largest first.
# stats_df itself is left in this order for the cells that follow.
stats_df = stats_df.sort_values("normalised_word_count", ascending=False)
stats_df.loc[:, ["artist", "normalised_word_count"]]

Unnamed: 0,artist,normalised_word_count
2,Eminem,7564
11,Migos,7438
6,Kendrick Lamar,6836
7,Kevin Gates,6356
5,J. Cole,6003
13,NF,5934
3,Future,5933
12,Moneybagg Yo,5730
8,Kodak Black,5632
17,YoungBoy Never Broke Again,5610


In [7]:
# Order artists by vocabulary richness (unique words / total words),
# richest first. stats_df itself is left in this order for later cells.
stats_df = stats_df.sort_values("vocab_richness", ascending=False)
stats_df.loc[:, ["artist", "vocab_richness"]]

Unnamed: 0,artist,vocab_richness
2,Eminem,0.354178
0,BROCKHAMPTON,0.347293
6,Kendrick Lamar,0.337039
15,Russ,0.306719
1,Drake,0.303333
4,Gucci Mane,0.28547
5,J. Cole,0.283192
7,Kevin Gates,0.281309
12,Moneybagg Yo,0.268586
3,Future,0.26833


In [17]:
# Show each artist next to their ten most frequent words, in rank order.
stats_df[
    [
        "artist",
        "1st_word",
        "2nd_word",
        "3rd_word",
        "4th_word",
        "5th_word",
        "6th_word",
        "7th_word",
        "8th_word",
        "9th_word",
        "10th_word",
    ]
]

Unnamed: 0,artist,1st_word,2nd_word,3rd_word,4th_word,5th_word,6th_word,7th_word,8th_word,9th_word,10th_word
2,Eminem,i'm,like,get,back,'cause,la,go,shit,know,never
0,BROCKHAMPTON,head,like,i'm,know,get,got,that's,ain't,tell,can't
6,Kendrick Lamar,i'm,nigga,like,know,ain't,feel,gon',black,got,alright
15,Russ,i'm,yeah,got,like,know,back,fuck,ain't,shit,i'll
1,Drake,like,i'm,know,yeah,got,niggas,get,shit,ain't,wanna
4,Gucci Mane,i'm,got,gucci,like,nigga,ain't,money,bales,back,way
5,J. Cole,i'm,niggas,count,nigga,like,know,got,see,wanna,shit
7,Kevin Gates,i'm,love,got,one,ain't,get,know,say,like,nigga
12,Moneybagg Yo,i'm,like,got,ain't,fuck,shit,get,real,nigga,niggas
3,Future,i'm,know,yeah,fuck,got,like,get,niggas,keep,nigga


In [39]:
# The ten rank columns holding each artist's most frequent words.
word_columns = ["1st_word","2nd_word","3rd_word","4th_word","5th_word","6th_word","7th_word","8th_word","9th_word","10th_word"]

# Invert the table: map each top-ten word to the artists who have it.
# Artists are appended in the current row order of stats_df.
words_by_artists = {}

for idx, row in stats_df.iterrows():
    for word in row[word_columns]:
        words_by_artists.setdefault(word, []).append(row["artist"])

# Distinct top-ten words, in first-seen order. This used to be built from
# stats_df[columns], which swept in artist names and the numeric counts
# rather than the word columns.
unique_words = list(words_by_artists)

In [40]:
# Display the word -> artists mapping built in the previous cell.
words_by_artists

{"'cause": ['Eminem', 'Wiz Khalifa', 'Kodak Black'],
 "ain't": ['BROCKHAMPTON',
  'Kendrick Lamar',
  'Russ',
  'Drake',
  'Gucci Mane',
  'Kevin Gates',
  'Moneybagg Yo',
  'Wiz Khalifa',
  'Kodak Black',
  'NF',
  'Post Malone',
  'YoungBoy Never Broke Again',
  'Lil Uzi Vert'],
 'alright': ['Kendrick Lamar'],
 'baby': ['Kodak Black'],
 'back': ['Eminem', 'Russ', 'Gucci Mane', 'Lil Yachty'],
 'bales': ['Gucci Mane'],
 'big': ['Migos'],
 'bitch': ['Lil Yachty', 'Migos', 'YoungBoy Never Broke Again'],
 'black': ['Kendrick Lamar'],
 "can't": ['BROCKHAMPTON', 'Lil Uzi Vert'],
 'count': ['J. Cole'],
 'culture': ['Migos'],
 'diamonds': ['Lil Uzi Vert'],
 'feel': ['Kendrick Lamar', 'NF'],
 'fuck': ['Russ', 'Moneybagg Yo', 'Future', 'Lil Yachty'],
 'gang': ['Migos'],
 'get': ['Eminem',
  'BROCKHAMPTON',
  'Drake',
  'Kevin Gates',
  'Moneybagg Yo',
  'Future',
  'Migos',
  'Wiz Khalifa',
  'NF'],
 'girl': ['Lil Uzi Vert'],
 'go': ['Eminem', 'Migos'],
 "gon'": ['Kendrick Lamar'],
 'got': ['BR

In [50]:
# (word, artists) pairs ordered from the fewest sharing artists to the most.
# sorted() is stable, so ties keep the dictionary's insertion order.
words_by_artists_sorted_by_artist_count = sorted(
    words_by_artists.items(),
    key=lambda entry: len(entry[1]),
)

# Print the words that appear in exactly one artist's top ten.
for word, artists in words_by_artists_sorted_by_artist_count:
    if len(artists) != 1:
        continue
    print(word, artists)

la ['Eminem']
head ['BROCKHAMPTON']
that's ['BROCKHAMPTON']
tell ['BROCKHAMPTON']
gon' ['Kendrick Lamar']
black ['Kendrick Lamar']
alright ['Kendrick Lamar']
i'll ['Russ']
gucci ['Gucci Mane']
money ['Gucci Mane']
bales ['Gucci Mane']
way ['Gucci Mane']
count ['J. Cole']
see ['J. Cole']
one ['Kevin Gates']
real ['Moneybagg Yo']
harley ['Lil Yachty']
name ['Lil Yachty']
culture ['Migos']
gang ['Migos']
big ['Migos']
hard ['Wiz Khalifa']
baby ['Kodak Black']
ooh ['Post Malone']
loaded ['Lil Uzi Vert']
diamonds ['Lil Uzi Vert']
girl ['Lil Uzi Vert']


In [56]:
# Print the words that appear in every artist's top ten.
# stats_df holds one row per artist, so its length is the number of artists;
# deriving it here replaces the hard-coded 18 and stays correct if the
# dataset gains or loses artists.
artist_total = len(stats_df)

for word, artists in words_by_artists_sorted_by_artist_count:
    if len(artists) == artist_total:
        print(word)

i'm
like


In [57]:
# (rows, columns) of the summary table; the row count is the number of artists.
stats_df.shape

(18, 14)