In [19]:
def getSpotifyData(source='https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'):
    """Load the Spotify songs CSV and reduce it to the columns used for analysis.

    Rows containing any missing value are dropped, the three ID columns are
    removed, the remaining columns are renamed positionally, the playlist
    genre and subgenre labels are title-cased, and the album release date is
    reduced to its year.

    Args:
        source: Anything ``pandas.read_csv`` accepts (URL, local path or
            file-like object). Defaults to the TidyTuesday 2020-01-21
            ``spotify_songs.csv`` hosted on GitHub.

    Returns:
        pandas.DataFrame: A data frame with the columns ``Name``, ``Artist``,
        ``Popularity``, ``Album Name``, ``Playlist Genre``,
        ``Playlist Subgenre`` and ``Year``.
    """
    data = pd.read_csv(source)
    # Drop incomplete rows while every raw column is still present, so a
    # missing value in any field (including ones discarded below) excludes
    # the row.
    data = data.dropna()
    data = data.drop(columns=['track_id', 'track_album_id', 'playlist_id'])
    # Positional rename: this relies on the CSV keeping its column order and
    # raises if the number of remaining columns is not exactly 20.
    data.columns = ["Name", "Artist", "Popularity", "Album Name", "Album Release Date", "Playlist Name",
                    "Playlist Genre", "Playlist Subgenre", "Danceability",
                    "Energy", "Key", "Loudness", "Mode", "Speechiness",
                    "Acousticness", "Instrumentalness", "Liveness", "Valence",
                    "Tempo", "Duration"]
    for column in ('Playlist Genre', 'Playlist Subgenre'):
        data[column] = data[column].str.title()
    release_dates = pd.to_datetime(data['Album Release Date'], format='%Y-%m-%d')
    # Vectorised year extraction; the cast keeps the int64 dtype that the
    # previous element-wise version produced.
    data['Year'] = release_dates.dt.year.astype('int64')
    # Key and Mode used to be recoded to note/scale names right before being
    # dropped; that work never reached the result, so it is omitted.
    return data.drop(columns=["Album Release Date", "Playlist Name", "Danceability",
                              "Energy", "Key", "Loudness", "Mode", "Speechiness",
                              "Acousticness", "Instrumentalness", "Liveness", "Valence",
                              "Tempo", "Duration"])

In [16]:
import pandas as pd

In [58]:
# Build the cleaned song table used by the analysis cells; getSpotifyData()
# reads the CSV from its GitHub URL each time this cell runs.
data = getSpotifyData()

In [44]:
new_df = pd.DataFrame()
# Average the numeric columns per song name. numeric_only=True is explicit
# because the text columns (artist, album, genre) cannot be averaged: older
# pandas dropped them silently, pandas 2.x raises a TypeError instead.
data_subset = data.groupby(["Name"]).mean(numeric_only=True)

In [71]:
# List the distinct playlist genre labels, in order of first appearance.
data["Playlist Genre"].unique()

array(['Pop', 'Rap', 'Rock', 'Latin', 'R&B', 'Edm'], dtype=object)

In [89]:
# Spot-check one genre/year slice: Pop rows whose Year is 1999.
data.loc[
    (data["Playlist Genre"] == "Pop") & (data["Year"] == 1999)
]



Unnamed: 0,Name,Artist,Popularity,Album Name,Playlist Genre,Playlist Subgenre,Year
751,Blue (Da Ba Dee),Eiffel 65,16,Europop,Pop,Dance Pop,1999
758,...Baby One More Time,Britney Spears,75,...Baby One More Time (Digital Deluxe Version),Pop,Dance Pop,1999
766,(You Drive Me) Crazy,Britney Spears,2,Baby One More Time,Pop,Dance Pop,1999
788,Back in My Life,Alice DJ,45,Back in My Life,Pop,Dance Pop,1999
795,Beautiful Life,Ace of Base,4,Singles of the 90s,Pop,Dance Pop,1999
809,Better Off Alone - Hitradio Mix,Alice Deejay,3,Better Off Alone,Pop,Dance Pop,1999
836,The Launch - Radio Edit,DJ Jean,53,The Launch,Pop,Dance Pop,1999
1707,What's My Age Again?,blink-182,13,Enema Of The State,Pop,Post-Teen Pop,1999
1708,All The Small Things,blink-182,17,Enema Of The State,Pop,Post-Teen Pop,1999
1755,All Star,Smash Mouth,79,Astro Lounge,Pop,Post-Teen Pop,1999


In [122]:
# Top 10 songs by mean popularity for every (genre, year) combination.
# Frames are collected in a list and concatenated once: DataFrame.append in a
# loop copies the accumulated frame on every iteration and was removed in
# pandas 2.0.
top_frames = []
for genre in data["Playlist Genre"].unique():
    for year in data["Year"].unique():
        new = data.loc[(data["Year"] == year) & (data["Playlist Genre"] == genre)]
        if new.empty:
            # No songs for this genre/year pair, so there is nothing to rank.
            continue
        # Average only Popularity; the remaining columns are text and cannot
        # be averaged (pandas 2.x raises instead of dropping them).
        data_subset = new.groupby(["Name", "Playlist Genre", "Year"], as_index=False)["Popularity"].mean()
        top_frames.append(data_subset.nlargest(10, 'Popularity'))
new_df = pd.concat(top_frames) if top_frames else pd.DataFrame()

In [123]:
# Persist the per-genre/year song ranking as a tab-separated file without the index column.
new_df.to_csv("top_songs.csv", sep="\t", index=False)

In [120]:
# Top 10 artists by mean popularity for every (genre, year) combination.
# Frames are collected in a list and concatenated once: DataFrame.append in a
# loop copies the accumulated frame on every iteration and was removed in
# pandas 2.0.
top_frames = []
for genre in data["Playlist Genre"].unique():
    for year in data["Year"].unique():
        new = data.loc[(data["Year"] == year) & (data["Playlist Genre"] == genre)]
        if new.empty:
            # No songs for this genre/year pair, so there is nothing to rank.
            continue
        # Average only Popularity; the remaining columns are text and cannot
        # be averaged (pandas 2.x raises instead of dropping them).
        data_subset = new.groupby(["Artist", "Playlist Genre", "Year"], as_index=False)["Popularity"].mean()
        top_frames.append(data_subset.nlargest(10, 'Popularity'))
new_df = pd.concat(top_frames) if top_frames else pd.DataFrame()

In [121]:
# Persist the per-genre/year artist ranking as a tab-separated file without the index column.
new_df.to_csv("top_artists.csv", sep="\t", index=False)

In [18]:
# Preview the first rows of the tab-separated clean_spotify.csv.
# NOTE(review): no cell visible here writes clean_spotify.csv; confirm where it is produced.
pd.read_csv('clean_spotify.csv', sep='\t').head()

Unnamed: 0,Name,Artist,Popularity,Album Name,Playlist Genre,Playlist Subgenre,Year
0,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,I Don't Care (with Justin Bieber) [Loud Luxury...,Pop,Dance Pop,2019
1,Memories - Dillon Francis Remix,Maroon 5,67,Memories (Dillon Francis Remix),Pop,Dance Pop,2019
2,All the Time - Don Diablo Remix,Zara Larsson,70,All the Time (Don Diablo Remix),Pop,Dance Pop,2019
3,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,Call You Mine - The Remixes,Pop,Dance Pop,2019
4,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,Someone You Loved (Future Humans Remix),Pop,Dance Pop,2019


In [14]:
# Save an untouched copy of the raw download to test.csv.
# The raw frame gets its own name so it does not overwrite the cleaned `data`
# frame: the raw file has no "Year"/"Playlist Genre" columns, which the
# summary cells need. The return value of to_csv is not kept because it is
# None when a path is given.
raw_data = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv')
raw_data.to_csv('test.csv', index=False)

In [114]:
# Scratch value: a one-entry dict mapping "a" to 1.
test = dict(a=1)

In [115]:
# Scratch check: look up the value stored under "a" in `test`.
test["a"]

1

In [145]:
# Song count and mean popularity per (year, genre, subgenre).
# Both statistics come from one grouped aggregation, so they cannot drift out
# of alignment, and only the Popularity column is averaged. The dict is
# unpacked because the output column names contain spaces.
counts = (
    data.groupby(["Year", "Playlist Genre", "Playlist Subgenre"])
    .agg(**{
        "Number of Songs": ("Popularity", "size"),
        "Mean Popularity": ("Popularity", "mean"),
    })
    .reset_index()
)

In [146]:
# Display the per-year/genre/subgenre summary table.
counts

Unnamed: 0,Year,Playlist Genre,Playlist Subgenre,Number of Songs,Mean Popularity
0,1957,R&B,Urban Contemporary,1,59.000000
1,1957,Rock,Classic Rock,1,1.000000
2,1958,Rock,Classic Rock,1,73.000000
3,1960,R&B,Neo Soul,2,13.000000
4,1960,R&B,Urban Contemporary,2,19.000000
...,...,...,...,...,...
878,2020,Rap,Hip Hop,98,55.836735
879,2020,Rap,Trap,56,41.928571
880,2020,Rock,Classic Rock,3,50.666667
881,2020,Rock,Hard Rock,32,37.593750


In [147]:
# Persist the genre/subgenre summary as a tab-separated file without the index column.
counts.to_csv("genre_splits.csv", sep="\t", index=False)

In [148]:
# Read the tab-separated summary back in to check that it round-trips.
test = pd.read_csv("genre_splits.csv", sep="\t")

In [150]:
# Check what type selecting the "Number of Songs" column of the reloaded frame returns.
type(test["Number of Songs"])

pandas.core.series.Series

In [152]:
# Reload both tab-separated ranking files and key them by the column each
# one is grouped on ("Name" for songs, "Artist" for artists).
topSongs, topArtists = (
    pd.read_csv(path, sep='\t') for path in ('top_songs.csv', 'top_artists.csv')
)
topData = dict(Name=topSongs, Artist=topArtists)

In [153]:
# Preview the first rows of the reloaded top-songs table.
topSongs.head()

Unnamed: 0,Name,Playlist Genre,Year,Popularity
0,Blinding Lights,Pop,2019,98.0
1,Memories,Pop,2019,98.0
2,Tusa,Pop,2019,98.0
3,Don't Start Now,Pop,2019,97.0
4,everything i wanted,Pop,2019,97.0
