In [1]:
import os
import pandas as pd
import plotly.express as px

In [2]:
class Song:
    def __init__(self, file_path):

        file_name = file_path.split("/")[-1]
        self.side = 'No_Side'
        self.year = file_path.split("/")[1].split("_")[0]
        self.album = file_path.split("/")[1][4:].replace("_", " ")
        if 'A_Side' in file_path:
            self.side = 'A_Side'
        elif 'B_Side' in file_path:
            self.side = 'B_Side'
        track_number_and_title = (file_name.split(".")[0])
        self.track_number = int(track_number_and_title.split("_")[0])
        self.title = " ".join(track_number_and_title.split("_"))[1:]
        self.text = open(file_path, 'r').read().replace("\n", " ")

    def __repr__(self):
        return self.title

In [3]:
class Album:
    def __init__(self, directory):

        directory_name = directory.split("/")[-1]

        self.title = directory_name[4:]
        self.year = directory_name[:4]
        self.sides = []
        self.track = {}

        self.load_songs_from_album_directory(directory)
        
        self.songs = [song for side in self.track.values() for song in side]

    def load_songs_from_album_directory(self, directory):
        files = list(os.walk(directory))

        self.sides = files[0][1]
        self.sides.sort()
        for side in self.sides:
            side_files = list(os.walk(directory + f"/{side}"))
            side_files[0][2].sort()
            self.track[side] = [Song(directory + f"/{side}/" + song_file) for song_file in side_files[0][2]]

        if files[0][2] != []:
            self.sides = ['No_Side']
            files[0][2].sort()
            self.track['No_Side'] = [Song(directory + "/" + song_file) for song_file in files[0][2]]
        self.sides.sort()


    def side_as_str(self, side):
        return "".join([f'\n>>>>>> {song}' for song in self.track[side]])


    def __repr__(self):
        headline = f"{self.year:<4}: {self.title}"
        sides = "".join([f"\n>>> {side} {self.side_as_str(side)}" for side in self.sides])
        return headline + sides
a = Album("data/1967_David_Bowie")

In [4]:
class BowieData:
    def __init__(self, directory):
        # gets the file paths for all the albums in the directory
        albums = list(map(lambda album: directory + album,list(os.walk(directory))[0][1]))
        albums.sort()
        self.albums = [Album(album) for album in albums]
        songs = [album.songs for album in self.albums]
        self.songs = [song for album in songs for song in album]

In [69]:
data = BowieData("data/")

[ Uncle Arthur,
  Sell Me a Coat,
  Rubber Band,
  Love You till Tuesday,
  There Is a Happy Land,
  We Are Hungry Men,
  When I Live My Dream,
  Little Bombardier,
  Silly Boy Blue,
  Come and Buy My Toys,
  Join the Gang,
  Shes Got Medals,
  Maid of Bond Street,
  Please Mr Gravedigger]

In [68]:
df = pd.DataFrame()
df['Song Title'] = data.songs
df['Index'] = df.index
df['Year'] = [song.year for song in data.songs]
df['Album'] = [song.album for song in data.songs]
df['Word Count'] = [len(song.text.split(" ")) for song in data.songs]
df['Unique Words'] = [len(set(song.text.split(" "))) for song in data.songs]
df['Percent Unique'] = df['Unique Words']/df['Word Count']
df['Average Word Length'] = [len(song.text.replace(" ", "")) for song in data.songs]/df['Word Count']
df[df['Index'] > 100]

Unnamed: 0,Song Title,Index,Year,Album,Word Count,Unique Words,Percent Unique,Average Word Length
101,Joe the Lion,101,1977b,b Heroes,232,105,0.452586,3.551724
102,Heroes,102,1977b,b Heroes,272,97,0.356618,3.750000
103,Sons of the Silent Age,103,1977b,b Heroes,242,111,0.458678,3.847107
104,Blackout,104,1977b,b Heroes,234,119,0.508547,3.991453
105,V 2 Schneider,105,1977b,b Heroes,4,2,0.500000,6.000000
...,...,...,...,...,...,...,...,...
247,Lazarus,247,2016,Blackstar,124,73,0.588710,3.588710
248,Sue Or in a Season of Crime,248,2016,Blackstar,151,96,0.635762,3.675497
249,Girl Loves Me,249,2016,Blackstar,235,85,0.361702,3.953191
250,Dollar Days,250,2016,Blackstar,202,77,0.381188,4.118812


In [81]:
df2 = pd.DataFrame()
albums = df['Album'].unique()
df2['Album'] = albums
df2['Index'] = df2.index
df2['Word Count'] = [df[df['Album'] == album]['Word Count'].sum() for album in albums]
df2['Song Count'] = [df[df['Album'] == album].shape[0] for album in albums]
df2['Average Word Count Per Song'] = df2['Word Count']/df2['Song Count']
df2['Unique Words'] = [len(set((" ".join([song.text for song in album.songs])).split(" "))) for album in data.albums]
df2['Percent Unique'] = df2['Unique Words']/df2['Word Count']
df2

Unnamed: 0,Album,Index,Word Count,Song Count,Average Word Count Per Song,Unique Words,Percent Unique
0,David Bowie,0,3158,14,225.571429,1100,0.348322
1,Space Oddity,1,2558,9,284.222222,929,0.363174
2,The Man Who Sold the World,2,1855,9,206.111111,684,0.368733
3,Hunky Dory,3,2817,11,256.090909,869,0.308484
4,The Rise and Fall of Ziggy Stardust and the S...,4,2860,11,260.0,785,0.274476
5,Aladdin Sane,5,2411,10,241.1,777,0.322273
6,Diamond Dogs,6,2641,11,240.090909,846,0.320333
7,Young Americans,7,2986,8,373.25,774,0.25921
8,Station to Station,8,2113,6,352.166667,487,0.230478
9,a Low,9,504,11,45.818182,200,0.396825


In [84]:
fig = px.line(df2, x="Index", y="Word Count", title='Word Count')
fig.show()

In [85]:
fig = px.line(df2, x="Index", y="Percent Unique", title='Percent Unique Words')
fig.show()