## Popular Music Lyrics Analysis

In [None]:
# library
import sys, os
import pandas as pd
from langdetect import detect

# classes
from lyrics_tool import Lyrics_Tool

In [None]:
"""
use pandas to read csv files from directory
join all datasets into final df variable
"""
# folder (relative to the notebook's working directory) that holds the CSVs
dataset_folder_path = 'datasets/'

# Each source names its title/artist columns differently; keep only those two
# columns and rename them to the shared ['name', 'artist'] schema.
dataset_1 = pd.read_csv(dataset_folder_path + 'top2018.csv')[['name', 'artists']]
dataset_1.columns = ['name', 'artist']

dataset_2 = pd.read_csv(dataset_folder_path + 'top50.csv', encoding='latin-1')[['Track.Name', 'Artist.Name']]
dataset_2.columns = ['name', 'artist']

dataset_3 = pd.read_csv(dataset_folder_path + 'data.csv')[['song_title', 'artist']]
dataset_3.columns = ['name', 'artist']

dataset_4 = pd.read_csv(dataset_folder_path + 'top10s.csv', encoding='latin-1')[['title', 'artist']]
dataset_4.columns = ['name', 'artist']

# join datasets together into one master dataframe.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every source keeps its own 0-based labels, so the index contains duplicates
# and a label lookup such as df['name'][0] matches one row per dataset
# instead of a single song.
df = pd.concat([dataset_1, dataset_2, dataset_3, dataset_4], ignore_index=True)

In [None]:
"""
clean data
remove parentheses to prevent problems with genius API
"""
# remove the parenthetical part (e.g. "(feat. X)") and everything after it
# from song names
clean_song_names = []
for name in df['name']:
    # pandas reads empty CSV cells as NaN (a float); leave non-string values
    # untouched instead of failing on the `in` test
    if isinstance(name, str) and '(' in name:
        # rstrip() removes whatever whitespace precedes the parenthesis; the
        # previous [:-1] slice assumed exactly one space and cut a real
        # character from titles such as "Stay(Remix)"
        trimmed = name.split('(')[0].rstrip()
        # a title that *starts* with a parenthesis would be reduced to an
        # empty string, so keep the original title in that case
        clean_song_names.append(trimmed or name)
    else:
        clean_song_names.append(name)

# update name column with clean values
df['name'] = clean_song_names

In [None]:
"""
extract unique value counts from each song,
then append to unique_values_df
"""
# dataframe container for all unique values
unique_values_df = pd.DataFrame()

# two lists of song names and artists
song_names = list(df['name'])
artist_names = list(df['artist'])

# initialize lyrics tool
l = Lyrics_Tool()

# create unique_values_df
for song_name, artist_name in zip(song_names, artist_names):
    # Fetch the song's unique word counts and a lyrics sample, then detect the
    # language of the sample. Any failure marks the song as 'unknown' so it is
    # skipped below.
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt
    # and SystemExit propagate, so this long-running loop can be interrupted.
    try:
        lyrics_unique_counts = l.unique_word_count(song_name, artist_name)
        words_sample = l.lyrics_sample(song_name, artist_name)

        # detect using langdetect
        language = detect(words_sample)
    except Exception:
        language = 'unknown'

    # only English songs contribute to the word counts
    if language != 'en':
        continue

    try:
        # add song data to master dataframe
        unique_values_df = pd.concat([unique_values_df, lyrics_unique_counts])

        # vet each song & update SQL database
        l.vet(song_name, artist_name)
    except Exception:
        print('Song not found in Genius database')

# move the accumulated index into a regular column
unique_values_df.reset_index(inplace=True)

In [None]:
# library
import sys, os
import pandas as pd
from langdetect import detect

import sys
# Put the project's `scripts` directory on the module search path before the
# imports that follow.
# NOTE(review): this is a machine-specific absolute path — presumably it is
# what makes `lyrics_tool` importable here; confirm and replace with a path
# relative to the repository before running on another machine.
sys.path.insert(1, r'C:\Users\wesle\Desktop\__MASTER__\repo_2\lyrics-analyzer-master\scripts')

# classes
from lyrics_tool import Lyrics_Tool

""" 
This function cleans and joins multiple song datasets,
then counts how many times each word/lyric occurs.
The result is a dataframe containing all unique words, sorted from most common to least common.
"""

# Default location of the source CSVs (also where the summary CSV is written).
_DEFAULT_DATASET_FOLDER = r'C:\Users\wesle\Desktop\__MASTER__\repo_2\lyrics-analyzer-master\datasets'


def _load_song_table(dataset_folder_path):
    """Read the four source CSVs and stack them into one name/artist table."""
    # (file name, [title column, artist column], encoding or None for default)
    sources = [
        ('top2018.csv', ['name', 'artists'], None),
        ('top50.csv', ['Track.Name', 'Artist.Name'], 'latin-1'),
        ('data.csv', ['song_title', 'artist'], None),
        ('top10s.csv', ['title', 'artist'], 'latin-1'),
    ]
    frames = []
    for file_name, source_columns, encoding in sources:
        read_options = {'encoding': encoding} if encoding else {}
        # os.path.join supplies the separator the old `folder + file_name`
        # concatenation was missing
        csv_path = os.path.join(dataset_folder_path, file_name)
        frame = pd.read_csv(csv_path, **read_options)[source_columns]
        frame.columns = ['name', 'artist']
        frames.append(frame)
    # ignore_index avoids duplicate index labels in the combined table
    return pd.concat(frames, ignore_index=True)


def _strip_parenthetical(name):
    """Return *name* without its first parenthetical part and what follows it."""
    # pandas reads empty CSV cells as NaN (a float); pass non-strings through
    if not isinstance(name, str) or '(' not in name:
        return name
    # rstrip() instead of [:-1]: do not assume exactly one space before "("
    trimmed = name.split('(')[0].rstrip()
    # a title that starts with "(" would become empty; keep it unchanged
    return trimmed or name


def _collect_word_counts(df):
    """Accumulate the per-song word counts of every English song in *df*."""
    unique_values_df = pd.DataFrame()

    # NOTE(review): the original body called `LyricsTool()` (a NameError, the
    # imported class is `Lyrics_Tool`) and `l.lyrics_data(...)`. The calls
    # below mirror the notebook cell that performs the same loop with
    # `Lyrics_Tool`; confirm they match the current lyrics_tool API.
    tool = Lyrics_Tool()

    for song_name, artist_name in zip(df['name'], df['artist']):
        # Any failure while fetching lyrics or detecting the language marks
        # the song as 'unknown' so it is skipped. `except Exception` keeps
        # KeyboardInterrupt/SystemExit propagating, unlike a bare `except`.
        try:
            lyrics_unique_counts = tool.unique_word_count(song_name, artist_name)
            words_sample = tool.lyrics_sample(song_name, artist_name)
            language = detect(words_sample)
        except Exception:
            language = 'unknown'

        # only English songs contribute to the word counts
        if language != 'en':
            continue

        try:
            # add song data to the accumulated dataframe
            unique_values_df = pd.concat([unique_values_df, lyrics_unique_counts])

            # vet each song & update SQL database
            tool.vet(song_name, artist_name)
        except Exception:
            print('Song not found in Genius database')

    return unique_values_df


def _summarize_word_counts(unique_values_df):
    """Total the per-song counts into one row per word, most common first."""
    # One grouped sum replaces the per-word filter + DataFrame.append loop
    # (DataFrame.append was removed in pandas 2.0).
    totals = unique_values_df.groupby('word', sort=False)['count'].sum()
    return totals.sort_values(ascending=False).to_frame()


def lyrics_analyzer(dataset_folder_path=_DEFAULT_DATASET_FOLDER):
    """Build a word-frequency table from the lyrics of the listed songs.

    Reads the four song CSVs from *dataset_folder_path*, strips parenthetical
    parts from the song names, collects word counts for every song whose
    lyrics sample is detected as English, and totals the counts per word.
    Standard output is silenced while the function runs and restored when it
    returns or raises.

    Args:
        dataset_folder_path: Directory containing ``top2018.csv``,
            ``top50.csv``, ``data.csv`` and ``top10s.csv``. Defaults to the
            original hard-coded location.

    Returns:
        A DataFrame indexed by ``word`` with a single ``count`` column,
        sorted from most common to least common.

    Side effects:
        Writes the result to ``LyricsAnalysis_1.csv`` inside
        *dataset_folder_path* and calls ``vet`` on the lyrics tool for every
        English song.
    """
    import contextlib

    # Silence prints for the duration of the call only. The previous version
    # replaced sys.stdout permanently and never closed the devnull handle.
    with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
        df = _load_song_table(dataset_folder_path)

        # remove parentheses to prevent problems with genius API
        df['name'] = [_strip_parenthetical(name) for name in df['name']]

        unique_values_df = _collect_word_counts(df)

        if unique_values_df.empty:
            # no English song was found: summarise an empty, well-formed table
            # instead of failing on the column rename
            unique_values_df = pd.DataFrame(columns=['word', 'count'])
        else:
            # move the index into a column and set the column names
            unique_values_df = unique_values_df.reset_index()
            unique_values_df.columns = ['word', 'count']

        master_df = _summarize_word_counts(unique_values_df)

        # export csv
        master_df.to_csv(os.path.join(dataset_folder_path, 'LyricsAnalysis_1.csv'))

    return master_df

In [None]:
# run the full pipeline; besides returning the word-count table, the call
# also writes it to LyricsAnalysis_1.csv in the datasets folder
master_df = lyrics_analyzer()

In [None]:
# plot a histogram of the per-word totals in the 'count' column, using 50 bins
master_df['count'].hist(bins=50)

### Check for explicit lyrics

In [None]:
# iterate through all rows in dataset.
# Walk the two columns positionally: `df` is a concatenation of several CSVs,
# so its index labels can repeat, and a label lookup like df['name'][i] may
# then return several rows or raise KeyError for larger i.
for name, artist in zip(df['name'], df['artist']):
    pass  # the per-song explicit-lyrics check is not implemented in this cell

In [None]:
# percentage of songs flagged clean (Clean == 1), rounded to two decimals
percent_clean = round(len(df.loc[df['Clean'].eq(1)]) / len(df) * 100, 2)