# Smiths Project 

**RQ: What are the main recurring themes, emotional patterns, and linguistic characteristics present in The Smiths' lyrics, and how do these evolve and perform over their Albums?**

Steps taken:
1. **Data Collection**
   - **Objective**: Gather a comprehensive collection of The Smiths'song and Albums
   - **Challenges**: The Smiths Discography is littered with junk. Struggled with getting just thier actualy songs for a while. The only solution that worked was specifying a list of albums that I wanted to look into. 
2. **Data Cleaning and Preprocessing**
   - **Objective**: Clean and prepare the lyrics data for analysis.
3. **Data Categorization and Theme Identification**
   - **Objective**: Identify main recurring themes in The Smiths’ lyrics.
4. **Sentiment Analysis**
   - **Objective**: Determine the sentiment expressed in each song.
5. **Visualization Setup**
   - **Objective**: Visualize the themes and sentiment changes over time.
6. **Analyze and Interpret Results**
   - **Objective**: Deeply interpret the themes and sentiments to uncover insights.
7. **Final Review and Refinement**
   - **Objective**: Ensure accuracy and add context.
8. **Documentation and Writeup**
   - **Objective**: Document your findings and compile the results into a comprehensive report.

In [10]:
# Import neccessary libraries

import regex as re
import lyricsgenius
import pandas as pd
import requests
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup

# Define real list of albums since most of the smiths catalog is Live albums and compilations or Movies
ALBUMS = ['The Smiths', 'Meat Is Murder', 'The Queen Is Dead', 'Strangeways, Here We Come', 'Louder Than Bombs', 'Hateful of Hallow']

In [11]:
# Get access token from API's

def get_access_token(url, client_id, client_secret):
    url = url
    data = {
        'client_id': client_id,
        'client_secret': client_secret,
        'grant_type': 'client_credentials'
    }
    response = requests.post(url, data=data)
    response_data = response.json()
    if response.status_code != 200:
        print('Error:', response_data['error'])
        return None
    else:
        print('Access Token:', response_data['access_token'])
        return response_data['access_token']
    

load_dotenv('3510.env')
genius_client_id = os.getenv('GENIUS_CLIENT_ID')
genius_client_secret = os.getenv('GENIUS_SECRET')
genius_access_token = get_access_token(url='https://api.genius.com/oauth/token', client_id=genius_client_id, client_secret=genius_client_secret)

spotify_client_id = os.getenv("SPOTIFY_CLIENT_ID")
spotify_client_secret = os.getenv("SPOTIFY_CLIENT_SECRET")

spotify_access_token = get_access_token(url='https://accounts.spotify.com/api/token', client_id=spotify_client_id, client_secret=spotify_client_secret)

Access Token: kTIHUB-Zwlh8EN0j5H6IgPeXQHjFlV_gB-rpRYfOHHjRyklC9Ry4sSuQma5aABGs
Access Token: BQAiSBWPZTZms8pH6PZIMRD3GK83kHpvO16_AY8wWWzGPxV7HDnRY7x8rbdLElXU21K5_phCxERKPYUvnFCjorLuSbfphAomM-FC0_za6P_wZvhxP_k


In [12]:
class Album:
    """ 
    This class represents an album. 

    Attributes:
    name (str): The name of the album.
    artist (str): The artist of the album.
    album_id (str): The album's unique identifier.
    release_date (str): The album's release date.
    total_tracks (int): The total number of tracks on the album.
    songs (list): A list of songs on the album.
    """
    def __init__(self, name, artist, album_id, release_date, total_tracks):
        self.name = name
        self.artist = artist
        self.album_id = album_id
        self.release_date = release_date
        self.total_tracks = total_tracks
        self.songs = []

    def add_song(self, song):
        """
        Add a song to the album.
        """
        self.songs.append(song)


class Song:
    """
    This class represents a song. The song is automatically added to the album's song list when created.

    Attributes:
    album (str): The album the song belongs to.
    name (str): The name of the song.
    artist (str): The artist of the song.
    song_id (str): The song's unique identifier.
    duration_ms (int): The duration of the song in milliseconds.
    explicit (bool): Whether the song contains explicit content.
    track_number (int): The track number of the song on the album.
    lyrics (str): The lyrics of the song.
    """
    def __init__(self, album, name, artist, song_id, duration_ms, explicit, track_number):
        self.album = album
        self.name = name
        self.artist = artist
        self.song_id = song_id
        self.duration_ms = duration_ms
        self.explicit = explicit
        self.track_number = track_number
        self.lyrics = None

        # Automatically add the song to the album's song list
        album.add_song(self)


In [13]:
# Get Albums from SpotifyAPI

rootURL = 'https://api.spotify.com/v1/'
headers = {"Authorization":"Bearer "+spotify_access_token}
smiths_search = requests.get(rootURL+'search?q=TheSmiths&type=artist',headers=headers).json()
smiths_id = smiths_search['artists']['items'][0]['id']
smiths_albums = requests.get(rootURL+'artists/'+smiths_id+'/albums',headers=headers).json()

In [14]:
# Assign albums and attributes to Album class

smiths_album_names = [album['name'] for album in smiths_albums['items'] if album['name'] in ALBUMS]
smiths_album_ids = [album['id'] for album in smiths_albums['items'] if album['name'] in ALBUMS]
smithAlbums = []

for i in range(len(smiths_album_names)):
    album = Album(name=smiths_album_names[i], 
                  artist='The Smiths', 
                  album_id=smiths_album_ids[i], 
                  release_date=smiths_albums['items'][i]['release_date'],
                  total_tracks=smiths_albums['items'][i]['total_tracks'])
    smithAlbums.append(album)

In [15]:
# Get songs from each album and assign to Song class
smith_songs_by_album = {}

for i in range(len(smiths_album_ids)):
    all_songs = []
    url = rootURL+'albums/'+smiths_album_ids[i]+'/tracks'
    songs = requests.get(url,headers=headers).json()
    for song in songs['items']:
        album = smithAlbums[i]
        song_name = song['name'].split(' - ')[0] # remove the extra info from the song name
        song_id = song['id']
        duration_ms = song['duration_ms']
        explicit = song['explicit']
        track_number = song['track_number']
        song = Song(album=album,
                    name=song_name, 
                    artist='The Smiths', 
                    song_id=song_id, 
                    duration_ms=duration_ms, 
                    explicit=explicit, 
                    track_number=track_number)
        all_songs.append(song.name)


In [16]:
# Get lyrics for each song

for i,album in enumerate(smithAlbums):
    for j,song in enumerate(album.songs):
        try:
            genius = lyricsgenius.Genius(genius_access_token)
            geniusSong = genius.search_song(song.name, "The Smiths")
            song.lyrics = geniusSong.lyrics
        except:
            song.lyrics = None
            print("Lyrics not found for", song.name)


Searching for "A Rush and a Push and the Land Is Ours" by The Smiths...
Done.
Searching for "I Started Something I Couldn't Finish" by The Smiths...
Done.
Searching for "Death of a Disco Dancer" by The Smiths...
Done.
Searching for "Girlfriend in a Coma" by The Smiths...
Done.
Searching for "Stop Me If You Think You've Heard This One Before" by The Smiths...
Done.
Searching for "Last Night I Dreamt That Somebody Loved Me" by The Smiths...
Done.
Searching for "Unhappy Birthday" by The Smiths...
Done.
Searching for "Paint a Vulgar Picture" by The Smiths...
Done.
Searching for "Death at One's Elbow" by The Smiths...
Done.
Searching for "I Won't Share You" by The Smiths...
Done.
Searching for "The Queen Is Dead" by The Smiths...
Done.
Searching for "Frankly, Mr. Shankly" by The Smiths...
Done.
Searching for "I Know It's Over" by The Smiths...
Done.
Searching for "Never Had No One Ever" by The Smiths...
Done.
Searching for "Cemetry Gates" by The Smiths...
Done.
Searching for "Bigmouth Strik

In [27]:
# Create a DataFrame and save to CSV so I dont have to wait 8 minutes to get the data again
albums_df = pd.DataFrame(columns=['Album Name', 'Artist', 'Release Date', 'Total Tracks', 'Song Name', 'Duration (ms)', 'Explicit', 'Track Number', 'Lyrics'])
for album in smithAlbums:
    for song in album.songs:
        if song.lyrics is None:
            continue
        else:
            song_data = pd.DataFrame([[album.name, song.artist, album.release_date, album.total_tracks, song.name, song.duration_ms, song.explicit, song.track_number, song.lyrics]], columns=['Album Name', 'Artist', 'Release Date', 'Total Tracks', 'Song Name', 'Duration (ms)', 'Explicit', 'Track Number', 'Lyrics'])
            albums_df = pd.concat([albums_df, song_data], ignore_index=True)
albums_df.to_csv('smiths_albums.csv', index=False)

In [28]:
# Clean up the lyrics column 
for index, row in albums_df.iterrows():
    try:
        # Handle columns with and without brackets diffrently 
        if '[' in row['Lyrics']:
            # Split on only the first occurrence of ']' and keep only the second part
            lyrics = row['Lyrics'].split(']', 1)[1]
            # Split on only the last occurrence of '[' and keep only the first part
            lyrics = lyrics.rsplit('[', 1)[0]
            # Use regex to remove all text in brackets
            pattern = r'\[.*?\]'
            lyrics = re.sub(pattern, ' ', lyrics)

            # Alter the DataFrame
            albums_df.at[index, 'Lyrics'] = lyrics
        else: 
            # Split on only the first occurrence of 'Lyrics' and keep only the second part
            lyrics = row['Lyrics'].split('Lyrics', 1)[1]
            # Split on only the last occurrence of 'E' for Embedded and keep only the first part
            lyrics = lyrics.rsplit('E', 1)[0]
            albums_df.at[index, 'Lyrics'] = lyrics
    except:
        print('Error cleaning lyrics for', row['Song Name'])
        pass

In [29]:
# Clean up the lyrics column 

for index, row in albums_df.iterrows():
    try:
        # Handle columns with and without brackets diffrently 
        if '[' in row['Lyrics']:
            # Split on only the first occurrence of ']' and keep only the second part
            lyrics = row['Lyrics'].split(']', 1)[1]
            # Split on only the last occurrence of '[' and keep only the first part
            lyrics = lyrics.rsplit('[', 1)[0]
            # Use regex to remove all text in brackets
            pattern = r'\[.*?\]'
            lyrics = re.sub(pattern, ' ', lyrics)

            # Alter the DataFrame
            albums_df.at[index, 'Lyrics'] = lyrics
        else: 
            # Split on only the first occurrence of 'Lyrics' and keep only the second part
            lyrics = row['Lyrics'].split('Lyrics', 1)[1]
            # Split on only the last occurrence of 'E' for Embedded and keep only the first part
            lyrics = lyrics.rsplit('E', 1)[0]
            albums_df.at[index, 'Lyrics'] = lyrics
    except:
        print('Error cleaning lyrics for', row['Song Name'])
        pass

Error cleaning lyrics for A Rush and a Push and the Land Is Ours
Error cleaning lyrics for I Started Something I Couldn't Finish
Error cleaning lyrics for Death of a Disco Dancer
Error cleaning lyrics for Girlfriend in a Coma
Error cleaning lyrics for Stop Me If You Think You've Heard This One Before
Error cleaning lyrics for Last Night I Dreamt That Somebody Loved Me
Error cleaning lyrics for Unhappy Birthday
Error cleaning lyrics for Paint a Vulgar Picture
Error cleaning lyrics for Death at One's Elbow
Error cleaning lyrics for I Won't Share You
Error cleaning lyrics for The Queen Is Dead
Error cleaning lyrics for Frankly, Mr. Shankly
Error cleaning lyrics for I Know It's Over
Error cleaning lyrics for Never Had No One Ever
Error cleaning lyrics for Cemetry Gates
Error cleaning lyrics for Bigmouth Strikes Again
Error cleaning lyrics for The Boy with the Thorn in His Side
Error cleaning lyrics for Vicar in a Tutu
Error cleaning lyrics for There Is a Light That Never Goes Out
Error cle

In [30]:
# Clean lyrics column further not based on any conditions

albums_df['Lyrics'] = albums_df['Lyrics'].str.replace('\n', ' ')

# Use regex to remove descriptors in the lyrics (*applause*)
pattern = r'\*.*?\*'
albums_df['Lyrics'] = albums_df['Lyrics'].apply(lambda x: re.sub(pattern, ' ', x) if x is not None else None)

In [31]:
# Tokenize and remove stop words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


stop_words = set(stopwords.words('english'))

# Define a function to tokenize and remove stop words
def tokenize_and_remove_stopwords(text):
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stop words
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words

# Apply the function to the 'Lyrics' column
albums_df['Tokenized Lyrics'] = albums_df['Lyrics'].apply(tokenize_and_remove_stopwords)


In [32]:
# albums_df.to_csv('smiths_albums_cleaned.csv', index=False)
# albums_df = pd.read_csv('smiths_albums_cleaned.csv')

In [42]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Example of tokenized and cleaned lyrics
lyrics = albums_df['Tokenized Lyrics'][albums_df['Song Name'] == "Heaven Knows I'm Miserable Now"].tolist()[0]
print(lyrics)
# Combine tokens into a single string
lyrics_str = " ".join(lyrics)

# Get sentiment scores
sentiment = analyzer.polarity_scores(lyrics_str)

print(f"Sentiment Scores: {sentiment}")


['happy', 'haze', 'drunken', 'hour', 'heaven', 'knows', "'m", 'miserable', 'looking', 'job', 'found', 'job', 'heaven', 'knows', "'m", 'miserable', 'life', ',', 'give', 'valuable', 'time', 'people', "n't", 'care', 'live', 'die', '?', 'Two', 'lovers', 'entwined', 'pass', 'heaven', 'knows', "'m", 'miserable', 'looking', 'job', 'found', 'job', 'heaven', 'knows', "'m", 'miserable', 'life', ',', 'oh', ',', 'give', 'valuable', 'time', 'people', "n't", 'care', 'live', 'die', '?', 'asked', 'end', 'day', 'Caligula', 'would', 'blushed', '``', 'Oh', ',', "'ve", 'house', 'long', ',', "''", 'said', 'naturally', 'fled', 'might', 'also', 'like', 'life', ',', 'smile', 'people', "'d", 'much', 'rather', 'Kick', 'eye', '?', 'happy', 'haze', 'drunken', 'hour', 'heaven', 'knows', "'m", 'miserable', '``', 'Oh', ',', "'ve", 'house', 'long', ',', "''", 'said', 'naturally', 'fled']
Sentiment Scores: {'neg': 0.141, 'neu': 0.558, 'pos': 0.301, 'compound': 0.9752}


In [46]:
import pprint as p
p.pprint(lyrics)

['happy',
 'haze',
 'drunken',
 'hour',
 'heaven',
 'knows',
 "'m",
 'miserable',
 'looking',
 'job',
 'found',
 'job',
 'heaven',
 'knows',
 "'m",
 'miserable',
 'life',
 ',',
 'give',
 'valuable',
 'time',
 'people',
 "n't",
 'care',
 'live',
 'die',
 '?',
 'Two',
 'lovers',
 'entwined',
 'pass',
 'heaven',
 'knows',
 "'m",
 'miserable',
 'looking',
 'job',
 'found',
 'job',
 'heaven',
 'knows',
 "'m",
 'miserable',
 'life',
 ',',
 'oh',
 ',',
 'give',
 'valuable',
 'time',
 'people',
 "n't",
 'care',
 'live',
 'die',
 '?',
 'asked',
 'end',
 'day',
 'Caligula',
 'would',
 'blushed',
 '``',
 'Oh',
 ',',
 "'ve",
 'house',
 'long',
 ',',
 "''",
 'said',
 'naturally',
 'fled',
 'might',
 'also',
 'like',
 'life',
 ',',
 'smile',
 'people',
 "'d",
 'much',
 'rather',
 'Kick',
 'eye',
 '?',
 'happy',
 'haze',
 'drunken',
 'hour',
 'heaven',
 'knows',
 "'m",
 'miserable',
 '``',
 'Oh',
 ',',
 "'ve",
 'house',
 'long',
 ',',
 "''",
 'said',
 'naturally',
 'fled']
