#### No need to run the following section if you have run the following code in your terminal:

```
pip install -r final-project/requirements.txt
```



In [9]:
pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Collection

we obtained the raw data for our project by collect data from a [Spotify-made "Indie Pop Hits" playlist](https://open.spotify.com/playlist/37i9dQZF1DXbO6rt3GhXDY?si=4c33da4bfe5c4b73) using available spotify API and library, as well as web-scraping from website : 'https://www.indieshuffle.com/songs/indie-pop/' using Python libraries like BeautifulSoup and requests

Due to the dynamic changes in the music industry, we chose to scrape data from dynamically generated pages. To get the most up-to-date collection of songs, you can run the following code. For the purpose of this project, please consider using files provided in the data folder, in the case of inconsistent result for the following sections (Cleaning, Analysis, Visualization).

In [10]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import requests
from requests.auth import HTTPBasicAuth

In [11]:
cid = 'c27327d3d3b44e06952699bd3024e1dd'
secret = '5b85ba25bdf2400199c183932bb036e2'
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [12]:
import json
import pandas as pd

### function(s) for both groups

In [13]:
# creating dataframe with audio features with list of tracks:
def create_audio_feature_df(tracks):
  track_ids = [track['id'] for track in tracks]
  audio_features = [sp.audio_features(track_id)[0] for track_id in track_ids]
  return pd.DataFrame(audio_features)

### GROUP 1 Spotify-made Playlist
unpersonalized playlist ("Indie Pop Hits")

n=100

In [14]:
#Retrieve the playlist: "Indie Pop Hits" using its uri
playlist_tracks = sp.playlist_tracks("37i9dQZF1DXbO6rt3GhXDY")

In [15]:
# creating track database
g1_tracks = []
for item in playlist_tracks['items']:
  g1_tracks.append(item['track'])

group1_df = pd.DataFrame(g1_tracks)

In [16]:
# creating audio features df using list of track_ids
g1_audio_df = create_audio_feature_df(g1_tracks)
g1_audio_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.764,0.705,3,-5.279,0,0.0278,0.0371,1.9e-05,0.0943,0.672,101.003,audio_features,0y60itmpH0aPKsFiGxmtnh,spotify:track:0y60itmpH0aPKsFiGxmtnh,https://api.spotify.com/v1/tracks/0y60itmpH0aP...,https://api.spotify.com/v1/audio-analysis/0y60...,196520,4
1,0.603,0.784,6,-4.023,1,0.062,0.446,8e-06,0.119,0.769,172.041,audio_features,3EaJDYHA0KnX88JvDhL9oa,spotify:track:3EaJDYHA0KnX88JvDhL9oa,https://api.spotify.com/v1/tracks/3EaJDYHA0KnX...,https://api.spotify.com/v1/audio-analysis/3EaJ...,173104,4
2,0.744,0.619,0,-9.805,1,0.039,0.598,0.00372,0.231,0.641,112.997,audio_features,7B3z0ySL9Rr0XvZEAjWZzM,spotify:track:7B3z0ySL9Rr0XvZEAjWZzM,https://api.spotify.com/v1/tracks/7B3z0ySL9Rr0...,https://api.spotify.com/v1/audio-analysis/7B3z...,188387,4
3,0.815,0.518,7,-6.594,0,0.0897,0.223,0.0,0.104,0.877,151.891,audio_features,0uI7yAKUf52Cn7y3sYyjiX,spotify:track:0uI7yAKUf52Cn7y3sYyjiX,https://api.spotify.com/v1/tracks/0uI7yAKUf52C...,https://api.spotify.com/v1/audio-analysis/0uI7...,177667,4
4,0.863,0.631,7,-4.689,1,0.0534,0.305,3e-05,0.123,0.817,128.977,audio_features,4nK5YrxbMGZstTLbvj6Gxw,spotify:track:4nK5YrxbMGZstTLbvj6Gxw,https://api.spotify.com/v1/tracks/4nK5YrxbMGZs...,https://api.spotify.com/v1/audio-analysis/4nK5...,223480,4


In [17]:
# combining track df and track audio feature df
group1_combined = group1_df.merge(g1_audio_df,on="uri")
group1_combined.columns

Index(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms_x',
       'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id_x',
       'is_local', 'name', 'popularity', 'preview_url', 'track',
       'track_number', 'type_x', 'uri', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'type_y', 'id_y', 'track_href',
       'analysis_url', 'duration_ms_y', 'time_signature'],
      dtype='object')

In [19]:
# exporting
group1_combined.to_csv('../data/raw/new_group1_raw.csv')

### GROUP 2 WEB SCRAPE
songs featured on indieshuffle.com

n = 133

In [None]:
from bs4 import BeautifulSoup
import requests
import html
import re

In [None]:
base_url = 'https://www.indieshuffle.com/songs/indie-pop/'

In [None]:
# function to extract song info in html format info into tuple
def extract_song_info(text):
  text = " ".join(line.strip() for line in text.splitlines())
  text = html.unescape(text)
  extract = "<h5>.*<strong>(.*)</strong>.*</span>(.*)</h5>"
  return re.findall(extract, text)

In [None]:
# extracting songs from the first n pages using beautiful soup into list of tuples
n = 26
song_elements = []
for n in range(1,n):
  url = f'{base_url}page/+{n}'
  res = requests.get(url)
  soup = BeautifulSoup(res.text)
  song_elements += soup.find_all('h5')

song_tuple_list = []
for element in song_elements:
  song_tuple_list += extract_song_info(str(element))

len(song_tuple_list)

200

In [None]:
g2_list = []
for artist_name, song_name in song_tuple_list:
    track_dict = sp.search(q='artist:' + artist_name + ', album:' + song_name, type="track", limit=1)
    g2_list.append(track_dict['tracks'])

In [None]:
g2_tracks=[]
n=0
for element in g2_list:
  try:
    g2_tracks.append(element['items'][0])
  except IndexError:
    n+=1
print(f'{n} tracks not found.')

68 tracks not found.


In [None]:
# creating audio features df using list of track_ids
g2_audio_df = create_audio_feature_df(g2_tracks)
g2_audio_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.766,0.335,6,-12.63,1,0.0306,0.461,0.0283,0.106,0.322,100.007,audio_features,0Yu6DQYBpvztT117wT37aR,spotify:track:0Yu6DQYBpvztT117wT37aR,https://api.spotify.com/v1/tracks/0Yu6DQYBpvzt...,https://api.spotify.com/v1/audio-analysis/0Yu6...,267125,4
1,0.668,0.475,7,-7.663,1,0.0325,0.442,0.000337,0.0997,0.586,111.993,audio_features,0HvbFmfFlcsAJ6hydqflNk,spotify:track:0HvbFmfFlcsAJ6hydqflNk,https://api.spotify.com/v1/tracks/0HvbFmfFlcsA...,https://api.spotify.com/v1/audio-analysis/0Hvb...,190148,4
2,0.492,0.601,8,-9.432,1,0.036,0.593,0.436,0.107,0.444,147.914,audio_features,3BqpakuX85Wtw8DI1yUr4q,spotify:track:3BqpakuX85Wtw8DI1yUr4q,https://api.spotify.com/v1/tracks/3BqpakuX85Wt...,https://api.spotify.com/v1/audio-analysis/3Bqp...,208658,4
3,0.694,0.676,5,-6.631,1,0.035,0.495,2e-06,0.356,0.666,96.085,audio_features,4XyTTV4598arHa8KRcgPPF,spotify:track:4XyTTV4598arHa8KRcgPPF,https://api.spotify.com/v1/tracks/4XyTTV4598ar...,https://api.spotify.com/v1/audio-analysis/4XyT...,124120,4
4,0.829,0.59,1,-9.05,1,0.314,0.063,0.0218,0.0609,0.804,98.05,audio_features,46OGrRCC0Y7ZyF0qvXsPNt,spotify:track:46OGrRCC0Y7ZyF0qvXsPNt,https://api.spotify.com/v1/tracks/46OGrRCC0Y7Z...,https://api.spotify.com/v1/audio-analysis/46OG...,195918,4


In [None]:
# combining track df and track audio feature df
group2_combined = group2_df.merge(g2_audio_df,on="uri")
group2_combined.columns

Index(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms_x',
       'explicit', 'external_ids', 'external_urls', 'href', 'id_x', 'is_local',
       'name', 'popularity', 'preview_url', 'track_number', 'type_x', 'uri',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type_y', 'id_y', 'track_href', 'analysis_url', 'duration_ms_y',
       'time_signature'],
      dtype='object')

In [None]:
# exporting
group2_combined.to_csv('../data/raw/new_group2_raw.csv')

### GROUP 3 JINZY's songs
n=7

In [None]:
# JINZY (URI: 7aCnUh8mmQKLeFl3bTtfFl) information
jinzy_top_tracks = sp.artist_top_tracks("7aCnUh8mmQKLeFl3bTtfFl")
group3_df = pd.DataFrame.from_dict(jinzy_top_tracks['tracks'])
group3_df

Unnamed: 0,album,artists,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,is_playable,name,popularity,preview_url,track_number,type,uri
0,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,212653,True,{'isrc': 'QZMEN2027923'},{'spotify': 'https://open.spotify.com/track/2k...,https://api.spotify.com/v1/tracks/2kktCN1EnBf2...,2kktCN1EnBf2dvIm5ZkdI3,False,True,I Left the City That Night,27,https://p.scdn.co/mp3-preview/b9cd53b59caab09f...,1,track,spotify:track:2kktCN1EnBf2dvIm5ZkdI3
1,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,184277,False,{'isrc': 'QZK6H2278643'},{'spotify': 'https://open.spotify.com/track/4G...,https://api.spotify.com/v1/tracks/4GK4ozFpzroP...,4GK4ozFpzroPxiUdE0k9yJ,False,True,The Show,2,https://p.scdn.co/mp3-preview/8cbc7a255d3011db...,1,track,spotify:track:4GK4ozFpzroPxiUdE0k9yJ
2,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,195716,True,{'isrc': 'QZNWY2265350'},{'spotify': 'https://open.spotify.com/track/5H...,https://api.spotify.com/v1/tracks/5HhLzu81uxhW...,5HhLzu81uxhWiyhvC0dHEO,False,True,5 o'clock somewhere,1,https://p.scdn.co/mp3-preview/2a53f26e477f92dc...,1,track,spotify:track:5HhLzu81uxhWiyhvC0dHEO
3,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,241668,False,{'isrc': 'QZMEM2216784'},{'spotify': 'https://open.spotify.com/track/6l...,https://api.spotify.com/v1/tracks/6lmOTdlIGSSE...,6lmOTdlIGSSEd55GLML2lZ,False,True,Firework,1,https://p.scdn.co/mp3-preview/319de7760ed18db7...,1,track,spotify:track:6lmOTdlIGSSEd55GLML2lZ
4,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,244305,False,{'isrc': 'QZK6G2278289'},{'spotify': 'https://open.spotify.com/track/2e...,https://api.spotify.com/v1/tracks/2ep2fSZcVZxd...,2ep2fSZcVZxdOTalVbXbCa,False,True,Turing's Secret,0,https://p.scdn.co/mp3-preview/6a56bfb8ec37f287...,1,track,spotify:track:2ep2fSZcVZxdOTalVbXbCa
5,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,239399,True,{'isrc': 'QZMEP2167730'},{'spotify': 'https://open.spotify.com/track/7a...,https://api.spotify.com/v1/tracks/7aAqZRdwLXzW...,7aAqZRdwLXzW1LejPLC79I,False,True,Sin City,0,https://p.scdn.co/mp3-preview/27115fd597018560...,1,track,spotify:track:7aAqZRdwLXzW1LejPLC79I
6,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,194857,False,{'isrc': 'QZMEM2216786'},{'spotify': 'https://open.spotify.com/track/5D...,https://api.spotify.com/v1/tracks/5DF1RA2Grkga...,5DF1RA2Grkga0mXk3WPStH,False,True,Joi,0,https://p.scdn.co/mp3-preview/0565f3f289eaa120...,3,track,spotify:track:5DF1RA2Grkga0mXk3WPStH
7,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,190870,False,{'isrc': 'QZMEM2216785'},{'spotify': 'https://open.spotify.com/track/68...,https://api.spotify.com/v1/tracks/68avRHl0QLfN...,68avRHl0QLfNwBUtZH8Aao,False,True,Forgotten,0,https://p.scdn.co/mp3-preview/0f1538a911d7b800...,2,track,spotify:track:68avRHl0QLfNwBUtZH8Aao


In [None]:
# getting audio feature from function (alt to previous cell)
g3_tracks = []
for item in jinzy_top_tracks['tracks']:
  g3_tracks.append(item)

g3_audio_df = create_audio_feature_df(g3_tracks)
g3_audio_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.525,0.575,4,-7.785,0,0.047,0.463,0.00185,0.0892,0.0739,158.084,audio_features,2kktCN1EnBf2dvIm5ZkdI3,spotify:track:2kktCN1EnBf2dvIm5ZkdI3,https://api.spotify.com/v1/tracks/2kktCN1EnBf2...,https://api.spotify.com/v1/audio-analysis/2kkt...,212654,4
1,0.696,0.485,4,-5.913,0,0.0788,0.286,0.0,0.321,0.512,171.984,audio_features,4GK4ozFpzroPxiUdE0k9yJ,spotify:track:4GK4ozFpzroPxiUdE0k9yJ,https://api.spotify.com/v1/tracks/4GK4ozFpzroP...,https://api.spotify.com/v1/audio-analysis/4GK4...,184277,4
2,0.798,0.559,11,-5.729,1,0.0343,0.561,0.0,0.13,0.875,119.998,audio_features,5HhLzu81uxhWiyhvC0dHEO,spotify:track:5HhLzu81uxhWiyhvC0dHEO,https://api.spotify.com/v1/tracks/5HhLzu81uxhW...,https://api.spotify.com/v1/audio-analysis/5HhL...,195717,4
3,0.743,0.435,1,-11.53,1,0.0411,0.0522,0.000996,0.0755,0.0843,111.892,audio_features,6lmOTdlIGSSEd55GLML2lZ,spotify:track:6lmOTdlIGSSEd55GLML2lZ,https://api.spotify.com/v1/tracks/6lmOTdlIGSSE...,https://api.spotify.com/v1/audio-analysis/6lmO...,241668,4
4,0.44,0.516,5,-6.384,0,0.071,0.48,0.0,0.0996,0.169,148.005,audio_features,2ep2fSZcVZxdOTalVbXbCa,spotify:track:2ep2fSZcVZxdOTalVbXbCa,https://api.spotify.com/v1/tracks/2ep2fSZcVZxd...,https://api.spotify.com/v1/audio-analysis/2ep2...,244305,4


In [None]:
group3_combined = group3_df.merge(g3_audio_df,on="uri")
group3_combined.head()

Unnamed: 0,album,artists,disc_number,duration_ms_x,explicit,external_ids,external_urls,href,id_x,is_local,...,instrumentalness,liveness,valence,tempo,type_y,id_y,track_href,analysis_url,duration_ms_y,time_signature
0,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,212653,True,{'isrc': 'QZMEN2027923'},{'spotify': 'https://open.spotify.com/track/2k...,https://api.spotify.com/v1/tracks/2kktCN1EnBf2...,2kktCN1EnBf2dvIm5ZkdI3,False,...,0.00185,0.0892,0.0739,158.084,audio_features,2kktCN1EnBf2dvIm5ZkdI3,https://api.spotify.com/v1/tracks/2kktCN1EnBf2...,https://api.spotify.com/v1/audio-analysis/2kkt...,212654,4
1,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,184277,False,{'isrc': 'QZK6H2278643'},{'spotify': 'https://open.spotify.com/track/4G...,https://api.spotify.com/v1/tracks/4GK4ozFpzroP...,4GK4ozFpzroPxiUdE0k9yJ,False,...,0.0,0.321,0.512,171.984,audio_features,4GK4ozFpzroPxiUdE0k9yJ,https://api.spotify.com/v1/tracks/4GK4ozFpzroP...,https://api.spotify.com/v1/audio-analysis/4GK4...,184277,4
2,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,195716,True,{'isrc': 'QZNWY2265350'},{'spotify': 'https://open.spotify.com/track/5H...,https://api.spotify.com/v1/tracks/5HhLzu81uxhW...,5HhLzu81uxhWiyhvC0dHEO,False,...,0.0,0.13,0.875,119.998,audio_features,5HhLzu81uxhWiyhvC0dHEO,https://api.spotify.com/v1/tracks/5HhLzu81uxhW...,https://api.spotify.com/v1/audio-analysis/5HhL...,195717,4
3,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,241668,False,{'isrc': 'QZMEM2216784'},{'spotify': 'https://open.spotify.com/track/6l...,https://api.spotify.com/v1/tracks/6lmOTdlIGSSE...,6lmOTdlIGSSEd55GLML2lZ,False,...,0.000996,0.0755,0.0843,111.892,audio_features,6lmOTdlIGSSEd55GLML2lZ,https://api.spotify.com/v1/tracks/6lmOTdlIGSSE...,https://api.spotify.com/v1/audio-analysis/6lmO...,241668,4
4,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,1,244305,False,{'isrc': 'QZK6G2278289'},{'spotify': 'https://open.spotify.com/track/2e...,https://api.spotify.com/v1/tracks/2ep2fSZcVZxd...,2ep2fSZcVZxdOTalVbXbCa,False,...,0.0,0.0996,0.169,148.005,audio_features,2ep2fSZcVZxdOTalVbXbCa,https://api.spotify.com/v1/tracks/2ep2fSZcVZxd...,https://api.spotify.com/v1/audio-analysis/2ep2...,244305,4


In [None]:
group3_combined.to_csv('../data/raw/new_group3_raw.csv')