In [14]:
import os
import pandas as pd
import plotly.express as px

In [15]:
class Song:
    def __init__(self, file_path):

        file_name = file_path.split("/")[-1]
        self.side = 'No_Side'
        self.year = file_path.split("/")[1].split("_")[0]
        self.album = file_path.split("/")[1][4:].replace("_", " ")
        if 'A_Side' in file_path:
            self.side = 'A_Side'
        elif 'B_Side' in file_path:
            self.side = 'B_Side'
        track_number_and_title = (file_name.split(".")[0])
        self.track_number = int(track_number_and_title.split("_")[0])
        self.title = " ".join(track_number_and_title.split("_"))[1:]
        self.text = open(file_path, 'r').read().replace("\n", " ")

    def __repr__(self):
        return self.title

In [16]:
class Album:
    def __init__(self, directory):

        directory_name = directory.split("/")[-1]

        self.title = directory_name[4:]
        self.year = directory_name[:4]
        self.sides = []
        self.track = {}

        self.load_songs_from_album_directory(directory)
        
        self.songs = [song for side in self.track.values() for song in side]

    def load_songs_from_album_directory(self, directory):
        files = list(os.walk(directory))

        self.sides = files[0][1]
        self.sides.sort()
        for side in self.sides:
            side_files = list(os.walk(directory + f"/{side}"))
            side_files[0][2].sort()
            self.track[side] = [Song(directory + f"/{side}/" + song_file) for song_file in side_files[0][2]]

        if files[0][2] != []:
            self.sides = ['No_Side']
            files[0][2].sort()
            self.track['No_Side'] = [Song(directory + "/" + song_file) for song_file in files[0][2]]
        self.sides.sort()


    def side_as_str(self, side):
        return "".join([f'\n>>>>>> {song}' for song in self.track[side]])


    def __repr__(self):
        headline = f"{self.year:<4}: {self.title}"
        sides = "".join([f"\n>>> {side} {self.side_as_str(side)}" for side in self.sides])
        return headline + sides
a = Album("data/1967_David_Bowie")

In [17]:
class BowieData:
    def __init__(self, directory):
        # gets the file paths for all the albums in the directory
        albums = list(map(lambda album: directory + album,list(os.walk(directory))[0][1]))
        albums.sort()
        self.albums = [Album(album) for album in albums]
        songs = [album.songs for album in self.albums]
        self.songs = [song for album in songs for song in album]

In [25]:
data = BowieData("data/")
df = pd.DataFrame()
df['Song Title'] = data.songs
df['Index'] = df.index
df['Year'] = [song.year for song in data.songs]
df['Album'] = [song.album for song in data.songs]
df['Word Count'] = [len(song.text.split(" ")) for song in data.songs]
df['Unique Words'] = [len(set(song.text.split(" "))) for song in data.songs]
df['Percent Unique'] = df['Unique Words']/df['Word Count']
df.head(10)

Unnamed: 0,Song Title,Index,Year,Album,Word Count,Unique Words,Percent Unique
0,Uncle Arthur,0,1967,David Bowie,204,125,0.612745
1,Sell Me a Coat,1,1967,David Bowie,234,78,0.333333
2,Rubber Band,2,1967,David Bowie,177,106,0.59887
3,Love You till Tuesday,3,1967,David Bowie,287,111,0.38676
4,There Is a Happy Land,4,1967,David Bowie,224,141,0.629464
5,We Are Hungry Men,5,1967,David Bowie,311,156,0.501608
6,When I Live My Dream,6,1967,David Bowie,280,138,0.492857
7,Little Bombardier,7,1967,David Bowie,206,130,0.631068
8,Silly Boy Blue,8,1967,David Bowie,168,86,0.511905
9,Come and Buy My Toys,9,1967,David Bowie,182,104,0.571429


In [28]:
fig = px.scatter(df, x="Year", y="Percent Unique", title='Word Count Over Time')
fig.show()

2