# Extracting Disliked Songs of a User using streaming history

In [267]:
# extracting the streaming history as JSON and save relevant columns as Pandas DF

import json
import pandas as pd

# Location of the Spotify "StreamingHistory" export to process.
# NOTE(review): this is a machine-specific absolute path; it has to be adjusted
# for every participant / machine the notebook is run on.
streaming_history_path = '/Users/khieuvon/Documents/10_Personal Stuff/01_Masterarbeit/Data for ML Model/Collected Spotify Data from Friends/03_Data_Moni_ISFP/Spotify Account Data/StreamingHistory_music_0.json'

# Read the JSON file. The encoding is stated explicitly so that non-ASCII
# artist/track names are decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open(streaming_history_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the fields used downstream: who, what, and for how long (in ms).
extracted_data = [
    {
        'artistName': item['artistName'],
        'trackName': item['trackName'],
        'msPlayed': item['msPlayed']
    }
    for item in data
]

# One row per entry of the streaming history
df = pd.DataFrame(extracted_data)

# Display the first few rows of the DataFrame
print(df.head())

# Save the DataFrame to a CSV file (optional)
# df.to_csv('streaming_history.csv', index=False)

            artistName                      trackName  msPlayed
0                 D.O.                    That's okay    221373
1         Jason Derulo  Vertigo (feat. Jordin Sparks)    233173
2  Panic! At The Disco              House of Memories    208706
3              Charice           Pyramid (feat. Iyaz)    187584
4              Charice           Pyramid (feat. Iyaz)      6122


In [268]:
# Preview the first ten rows of the extracted streaming history
df.head(10)

Unnamed: 0,artistName,trackName,msPlayed
0,D.O.,That's okay,221373
1,Jason Derulo,Vertigo (feat. Jordin Sparks),233173
2,Panic! At The Disco,House of Memories,208706
3,Charice,Pyramid (feat. Iyaz),187584
4,Charice,Pyramid (feat. Iyaz),6122
5,Ed Sheeran,Bad Habits,230746
6,Jason Derulo,The Other Side,226986
7,SLANDER,Love Is Gone,256042
8,Kurt Hugo Schneider,I Really Like You,191555
9,Jason Derulo,Vertigo (feat. Jordin Sparks),233173


In [269]:
# (rows, columns) of the extracted streaming history
df.shape

(3884, 3)

In [272]:
# Label every play by its listening time: strictly more than 30000 ms (30 s)
# counts as a like (1); 30000 ms or less counts as a dislike (0).
df['like_dislike'] = (df['msPlayed'] > 30000).astype(int)

# Filter for disliked songs (like_dislike = 0)
df_dislike = df[df['like_dislike'] == 0]

# Filter for liked songs (like_dislike = 1)
df_like = df[df['like_dislike'] == 1]

# Remove entries where trackName is "Unknown Track"
df_dislike = df_dislike[df_dislike['trackName'] != "Unknown Track"]

# Drop duplicate values. keep=False removes *every* row whose trackName occurs
# more than once among the disliked plays, so only titles skipped exactly once
# remain.
# NOTE(review): confirm this is intended; keep='first' would retain one row per
# repeatedly skipped title instead of discarding those titles altogether.
df_dislike = df_dislike.drop_duplicates(subset='trackName', keep=False)

# Draw a reproducible random sample of at most 200 disliked tracks.
# DataFrame.sample raises ValueError when n exceeds the number of rows (sampling
# is without replacement), so n is capped at the population size.
df_dislike_sampled = df_dislike.sample(n=min(200, len(df_dislike)), random_state=42)

In [273]:
# (rows, columns) of the disliked tracks left after filtering and de-duplication
df_dislike.shape

(116, 4)

In [274]:
# (rows, columns) of the random sample drawn from df_dislike
df_dislike_sampled.shape

(116, 4)

In [282]:
# Count rows that are exact duplicates (all columns equal) of an earlier row
df_dislike_sampled.duplicated().sum()

0

In [283]:
# getting Song URI based on the artistName and trackName

import base64
import os

import pandas as pd
import requests

# Spotify API credentials, read from the environment so that no secret is stored
# in the notebook. Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError naming that variable.
# Any credential that has ever been saved in a shared notebook should be
# regenerated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']

# Function to get access token
def get_access_token(client_id, client_secret, timeout=10):
    """Request an access token using the client-credentials grant.

    Args:
        client_id: Spotify application client id.
        client_secret: Spotify application client secret.
        timeout: Seconds to wait for the token endpoint before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The value of 'access_token' in the JSON body of the token response.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status
            (for example because the credentials were rejected).
    """
    auth_url = 'https://accounts.spotify.com/api/token'
    # HTTP Basic auth: base64 of "<client_id>:<client_secret>"
    auth_header = base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()
    auth_data = {'grant_type': 'client_credentials'}
    auth_response = requests.post(
        auth_url,
        headers={'Authorization': f'Basic {auth_header}'},
        data=auth_data,
        timeout=timeout,
    )
    # Fail with the HTTP status instead of a bare KeyError on 'access_token'
    auth_response.raise_for_status()
    return auth_response.json()['access_token']

# Function to search for a track and get its URI
def get_track_uri(artist_name, track_name, access_token, timeout=10):
    """Return the URI of the first search hit for a track, or None.

    Sends one search request with the query
    "track:<track_name> artist:<artist_name>", limited to a single result of
    type 'track'.

    Args:
        artist_name: Artist name used in the "artist:" part of the query.
        track_name: Track title used in the "track:" part of the query.
        access_token: Bearer token sent in the Authorization header.
        timeout: Seconds to wait for the search endpoint before giving up, so a
            stalled connection cannot block a row-by-row lookup forever.

    Returns:
        The 'uri' of the first returned track, or None when the response status
        is not 200 or the response contains no track items.
    """
    search_url = 'https://api.spotify.com/v1/search'
    query = f"track:{track_name} artist:{artist_name}"
    search_params = {
        'q': query,
        'type': 'track',
        'limit': 1
    }
    search_response = requests.get(
        search_url,
        headers={'Authorization': f'Bearer {access_token}'},
        params=search_params,
        timeout=timeout,
    )

    # Best effort: any non-200 answer is reported as "no match".
    if search_response.status_code != 200:
        return None

    # Tolerate a payload without 'tracks' / 'items' instead of raising KeyError.
    items = (search_response.json().get('tracks') or {}).get('items') or []
    if not items:
        return None
    return items[0]['uri']

# Get access token
# Fetched once here and then read as a global by get_uri_for_row for every row.
# NOTE(review): the token presumably expires after a while; re-run this cell if
# later lookups start returning None for every track.
access_token = get_access_token(client_id, client_secret)

# Row-wise adapter so the URI lookup can be used with DataFrame.apply(axis=1)
def get_uri_for_row(row):
    """Look up the track URI for one row.

    Reads 'artistName' and 'trackName' from ``row`` and passes them, together
    with the module-level ``access_token``, to ``get_track_uri``; returns
    whatever that call returns.
    """
    artist = row['artistName']
    title = row['trackName']
    return get_track_uri(artist, title, access_token)

# Resolve one URI per sampled row (get_uri_for_row is called once per row) and
# store the results in a new 'track_uri' column
track_uris = df_dislike_sampled.apply(get_uri_for_row, axis=1)
df_dislike_sampled['track_uri'] = track_uris

# Show the first few rows, restricted to the columns relevant for the lookup
preview_columns = ['artistName', 'trackName', 'track_uri']
print(df_dislike_sampled[preview_columns].head())

# Optionally, save the updated DataFrame to a CSV file
# df_dislike.to_csv('disliked_songs_with_uris.csv', index=False)

         artistName            trackName                             track_uri
2740   Mark Forster        Wir Sind Groß  spotify:track:7AraSdlKIOoPCLRKkI3WjP
308             陈令韬            输入法打可爱按第五  spotify:track:53zetdmdcvQnruSWhLTwFP
1241  Lewis Capaldi  How I'm Feeling Now  spotify:track:5AGS8VxsLyHQOWz0ZTuwIN
1112          LeeHi              BREATHE  spotify:track:6G4z9WbxyEeWdEQTfShACT
467     James Blunt                 1973  spotify:track:1BV0m40U0M4t1SLIsDnwZl


In [284]:
# Remove the play-time and label columns; the result keeps artistName,
# trackName and track_uri
df_dislike_sampled_final = df_dislike_sampled.drop(columns=['msPlayed', 'like_dislike'])

In [285]:
# Derive the bare track id: the last ':'-separated segment of the URI
# (e.g. 'spotify:track:<id>' -> '<id>'). The column is selected by name rather
# than by position so that adding or reordering columns cannot silently make
# this read a different column.
df_dislike_sampled_final['track_id'] = df_dislike_sampled_final['track_uri'].str.split(':').str.get(-1)

In [286]:
# Inspect the first ten rows, now including track_uri and track_id
df_dislike_sampled_final.head(10)

Unnamed: 0,artistName,trackName,track_uri,track_id
2740,Mark Forster,Wir Sind Groß,spotify:track:7AraSdlKIOoPCLRKkI3WjP,7AraSdlKIOoPCLRKkI3WjP
308,陈令韬,输入法打可爱按第五,spotify:track:53zetdmdcvQnruSWhLTwFP,53zetdmdcvQnruSWhLTwFP
1241,Lewis Capaldi,How I'm Feeling Now,spotify:track:5AGS8VxsLyHQOWz0ZTuwIN,5AGS8VxsLyHQOWz0ZTuwIN
1112,LeeHi,BREATHE,spotify:track:6G4z9WbxyEeWdEQTfShACT,6G4z9WbxyEeWdEQTfShACT
467,James Blunt,1973,spotify:track:1BV0m40U0M4t1SLIsDnwZl,1BV0m40U0M4t1SLIsDnwZl
1296,Jason Derulo,Don't Wanna Go Home,spotify:track:7rGjz05rxd9MaXUAo7F4kb,7rGjz05rxd9MaXUAo7F4kb
3665,Wincent Weiss,1993,spotify:track:3E8qvYDyhKat3qGhmovygJ,3E8qvYDyhKat3qGhmovygJ
1022,Mark Forster,Drei Uhr Nachts,spotify:track:58GxgFEG5uHc9oMhofuuIP,58GxgFEG5uHc9oMhofuuIP
2377,Paloma Faith,Only Love Can Hurt Like This,spotify:track:62ke5zFUJN6RvtXZgVH0F8,62ke5zFUJN6RvtXZgVH0F8
472,BTS,Epilogue: Young Forever,spotify:track:6BskTMfDjXVzeqQHOau5Vi,6BskTMfDjXVzeqQHOau5Vi


In [287]:
# Keep only the first occurrence of each fully identical row
is_repeated_row = df_dislike_sampled_final.duplicated()
df_dislike_sampled_final = df_dislike_sampled_final[~is_repeated_row]

In [288]:
# Verify that no fully duplicated rows remain
df_dislike_sampled_final.duplicated().sum()

0

In [289]:
# Drop the rows for which no track URI was found. The check is restricted to
# 'track_uri' so that the code does what this comment says; a bare dropna()
# would also discard rows with a null in any other column.
df_dislike_sampled_final = df_dislike_sampled_final.dropna(subset=['track_uri'])

In [290]:
# (rows, columns) remaining after the rows with nulls were dropped
df_dislike_sampled_final.shape

(109, 4)

In [291]:
# Inspect the first twenty rows of the cleaned frame
df_dislike_sampled_final.head(20)

Unnamed: 0,artistName,trackName,track_uri,track_id
2740,Mark Forster,Wir Sind Groß,spotify:track:7AraSdlKIOoPCLRKkI3WjP,7AraSdlKIOoPCLRKkI3WjP
308,陈令韬,输入法打可爱按第五,spotify:track:53zetdmdcvQnruSWhLTwFP,53zetdmdcvQnruSWhLTwFP
1241,Lewis Capaldi,How I'm Feeling Now,spotify:track:5AGS8VxsLyHQOWz0ZTuwIN,5AGS8VxsLyHQOWz0ZTuwIN
1112,LeeHi,BREATHE,spotify:track:6G4z9WbxyEeWdEQTfShACT,6G4z9WbxyEeWdEQTfShACT
467,James Blunt,1973,spotify:track:1BV0m40U0M4t1SLIsDnwZl,1BV0m40U0M4t1SLIsDnwZl
1296,Jason Derulo,Don't Wanna Go Home,spotify:track:7rGjz05rxd9MaXUAo7F4kb,7rGjz05rxd9MaXUAo7F4kb
3665,Wincent Weiss,1993,spotify:track:3E8qvYDyhKat3qGhmovygJ,3E8qvYDyhKat3qGhmovygJ
1022,Mark Forster,Drei Uhr Nachts,spotify:track:58GxgFEG5uHc9oMhofuuIP,58GxgFEG5uHc9oMhofuuIP
2377,Paloma Faith,Only Love Can Hurt Like This,spotify:track:62ke5zFUJN6RvtXZgVH0F8,62ke5zFUJN6RvtXZgVH0F8
472,BTS,Epilogue: Young Forever,spotify:track:6BskTMfDjXVzeqQHOau5Vi,6BskTMfDjXVzeqQHOau5Vi


In [292]:
# Count missing track_id values
df_dislike_sampled_final['track_id'].isnull().sum()

0

#### extracting audio_features via Spotify API

In [293]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Build the spotipy client from the client_id / client_secret variables defined
# in the URI-lookup cell above, so the credentials are not repeated as literals
# in a second place.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [294]:
import time

def get_audio_features_batch(track_ids, batch_size=100):
    """Fetch audio features for many tracks, ``batch_size`` ids per request.

    Uses the module-level spotipy client ``sp``.

    Args:
        track_ids: Sequence of track ids; it is sliced into consecutive batches.
        batch_size: Number of ids sent per ``sp.audio_features`` call.

    Returns:
        A flat list with the feature entries of all batches, in request order.
        Entries that come back as None (no features returned for that id) are
        skipped, so the list can be shorter than ``track_ids``; match results
        to tracks by id rather than by position.
    """
    audio_features = []
    for start in range(0, len(track_ids), batch_size):
        if start:
            # 1-second pause *between* batches to respect rate limits; there is
            # nothing to wait for before the first or after the last batch.
            time.sleep(1)
        batch = track_ids[start:start + batch_size]
        # Guard against a None response as well as None entries inside it.
        features = sp.audio_features(batch) or []
        audio_features.extend(entry for entry in features if entry is not None)
    return audio_features

In [295]:
# Collect the track ids as a plain list, then fetch their audio features in batches
all_track_ids = df_dislike_sampled_final['track_id'].to_list()
all_audio_features = get_audio_features_batch(all_track_ids)

In [296]:
# One row per returned audio-features entry
audio_features_df = pd.DataFrame(all_audio_features)

# Keep a single feature row per track id. If several songs resolved to the same
# track_id, the features were fetched once per occurrence, and a left merge
# against repeated ids would multiply those song rows.
audio_features_df = audio_features_df.drop_duplicates(subset='id')

# Attach the features to every song; songs without features keep NaN values
result_df = pd.merge(df_dislike_sampled_final, audio_features_df, left_on='track_id', right_on='id', how='left')

In [297]:
# (rows, columns) after merging in the audio features
result_df.shape

(109, 22)

In [298]:
# drop_duplicates returns a new frame; assign it back so the de-duplication
# actually takes effect, then display the result
result_df = result_df.drop_duplicates()
result_df

Unnamed: 0,artistName,trackName,track_uri,track_id,danceability,energy,key,loudness,mode,speechiness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,Mark Forster,Wir Sind Groß,spotify:track:7AraSdlKIOoPCLRKkI3WjP,7AraSdlKIOoPCLRKkI3WjP,0.624,0.687,7,-6.403,0,0.1720,...,0.1120,0.407,86.038,audio_features,7AraSdlKIOoPCLRKkI3WjP,spotify:track:7AraSdlKIOoPCLRKkI3WjP,https://api.spotify.com/v1/tracks/7AraSdlKIOoP...,https://api.spotify.com/v1/audio-analysis/7Ara...,204760,4
1,陈令韬,输入法打可爱按第五,spotify:track:53zetdmdcvQnruSWhLTwFP,53zetdmdcvQnruSWhLTwFP,0.550,0.527,7,-7.682,1,0.0268,...,0.1180,0.330,143.910,audio_features,53zetdmdcvQnruSWhLTwFP,spotify:track:53zetdmdcvQnruSWhLTwFP,https://api.spotify.com/v1/tracks/53zetdmdcvQn...,https://api.spotify.com/v1/audio-analysis/53ze...,229686,4
2,Lewis Capaldi,How I'm Feeling Now,spotify:track:5AGS8VxsLyHQOWz0ZTuwIN,5AGS8VxsLyHQOWz0ZTuwIN,0.685,0.435,0,-3.979,1,0.0277,...,0.1900,0.398,102.910,audio_features,5AGS8VxsLyHQOWz0ZTuwIN,spotify:track:5AGS8VxsLyHQOWz0ZTuwIN,https://api.spotify.com/v1/tracks/5AGS8VxsLyHQ...,https://api.spotify.com/v1/audio-analysis/5AGS...,226440,4
3,LeeHi,BREATHE,spotify:track:6G4z9WbxyEeWdEQTfShACT,6G4z9WbxyEeWdEQTfShACT,0.616,0.251,8,-8.454,1,0.0380,...,0.0790,0.356,123.802,audio_features,6G4z9WbxyEeWdEQTfShACT,spotify:track:6G4z9WbxyEeWdEQTfShACT,https://api.spotify.com/v1/tracks/6G4z9WbxyEeW...,https://api.spotify.com/v1/audio-analysis/6G4z...,288915,4
4,James Blunt,1973,spotify:track:1BV0m40U0M4t1SLIsDnwZl,1BV0m40U0M4t1SLIsDnwZl,0.720,0.668,9,-7.928,1,0.0269,...,0.0789,0.769,123.005,audio_features,1BV0m40U0M4t1SLIsDnwZl,spotify:track:1BV0m40U0M4t1SLIsDnwZl,https://api.spotify.com/v1/tracks/1BV0m40U0M4t...,https://api.spotify.com/v1/audio-analysis/1BV0...,280027,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,Katy Perry,Unconditionally,spotify:track:009ImBOrIUlWgla8U05RAC,009ImBOrIUlWgla8U05RAC,0.432,0.725,7,-4.862,1,0.0431,...,0.2080,0.353,128.902,audio_features,009ImBOrIUlWgla8U05RAC,spotify:track:009ImBOrIUlWgla8U05RAC,https://api.spotify.com/v1/tracks/009ImBOrIUlW...,https://api.spotify.com/v1/audio-analysis/009I...,228879,4
105,SEVENTEEN,Left & Right,spotify:track:065yxZRBAsenRLZacB1uc2,065yxZRBAsenRLZacB1uc2,0.725,0.823,6,-3.868,0,0.1150,...,0.1040,0.958,149.994,audio_features,065yxZRBAsenRLZacB1uc2,spotify:track:065yxZRBAsenRLZacB1uc2,https://api.spotify.com/v1/tracks/065yxZRBAsen...,https://api.spotify.com/v1/audio-analysis/065y...,201773,4
106,Tyler Ward,I Don't Wanna Live Forever (Fifty Shades Darker),spotify:track:1etANXelRasvdwGDDj0reR,1etANXelRasvdwGDDj0reR,0.787,0.403,0,-11.989,1,0.0330,...,0.1310,0.264,118.043,audio_features,1etANXelRasvdwGDDj0reR,spotify:track:1etANXelRasvdwGDDj0reR,https://api.spotify.com/v1/tracks/1etANXelRasv...,https://api.spotify.com/v1/audio-analysis/1etA...,222352,4
107,Train,Drops of Jupiter (Tell Me),spotify:track:2hKdd3qO7cWr2Jo0Bcs0MA,2hKdd3qO7cWr2Jo0Bcs0MA,0.481,0.638,0,-5.862,1,0.0276,...,0.1540,0.497,79.064,audio_features,2hKdd3qO7cWr2Jo0Bcs0MA,spotify:track:2hKdd3qO7cWr2Jo0Bcs0MA,https://api.spotify.com/v1/tracks/2hKdd3qO7cWr...,https://api.spotify.com/v1/audio-analysis/2hKd...,259933,4


In [299]:
# Inspect the first twenty merged rows
result_df.head(20)

Unnamed: 0,artistName,trackName,track_uri,track_id,danceability,energy,key,loudness,mode,speechiness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,Mark Forster,Wir Sind Groß,spotify:track:7AraSdlKIOoPCLRKkI3WjP,7AraSdlKIOoPCLRKkI3WjP,0.624,0.687,7,-6.403,0,0.172,...,0.112,0.407,86.038,audio_features,7AraSdlKIOoPCLRKkI3WjP,spotify:track:7AraSdlKIOoPCLRKkI3WjP,https://api.spotify.com/v1/tracks/7AraSdlKIOoP...,https://api.spotify.com/v1/audio-analysis/7Ara...,204760,4
1,陈令韬,输入法打可爱按第五,spotify:track:53zetdmdcvQnruSWhLTwFP,53zetdmdcvQnruSWhLTwFP,0.55,0.527,7,-7.682,1,0.0268,...,0.118,0.33,143.91,audio_features,53zetdmdcvQnruSWhLTwFP,spotify:track:53zetdmdcvQnruSWhLTwFP,https://api.spotify.com/v1/tracks/53zetdmdcvQn...,https://api.spotify.com/v1/audio-analysis/53ze...,229686,4
2,Lewis Capaldi,How I'm Feeling Now,spotify:track:5AGS8VxsLyHQOWz0ZTuwIN,5AGS8VxsLyHQOWz0ZTuwIN,0.685,0.435,0,-3.979,1,0.0277,...,0.19,0.398,102.91,audio_features,5AGS8VxsLyHQOWz0ZTuwIN,spotify:track:5AGS8VxsLyHQOWz0ZTuwIN,https://api.spotify.com/v1/tracks/5AGS8VxsLyHQ...,https://api.spotify.com/v1/audio-analysis/5AGS...,226440,4
3,LeeHi,BREATHE,spotify:track:6G4z9WbxyEeWdEQTfShACT,6G4z9WbxyEeWdEQTfShACT,0.616,0.251,8,-8.454,1,0.038,...,0.079,0.356,123.802,audio_features,6G4z9WbxyEeWdEQTfShACT,spotify:track:6G4z9WbxyEeWdEQTfShACT,https://api.spotify.com/v1/tracks/6G4z9WbxyEeW...,https://api.spotify.com/v1/audio-analysis/6G4z...,288915,4
4,James Blunt,1973,spotify:track:1BV0m40U0M4t1SLIsDnwZl,1BV0m40U0M4t1SLIsDnwZl,0.72,0.668,9,-7.928,1,0.0269,...,0.0789,0.769,123.005,audio_features,1BV0m40U0M4t1SLIsDnwZl,spotify:track:1BV0m40U0M4t1SLIsDnwZl,https://api.spotify.com/v1/tracks/1BV0m40U0M4t...,https://api.spotify.com/v1/audio-analysis/1BV0...,280027,4
5,Jason Derulo,Don't Wanna Go Home,spotify:track:7rGjz05rxd9MaXUAo7F4kb,7rGjz05rxd9MaXUAo7F4kb,0.623,0.874,2,-4.296,0,0.0506,...,0.179,0.682,127.992,audio_features,7rGjz05rxd9MaXUAo7F4kb,spotify:track:7rGjz05rxd9MaXUAo7F4kb,https://api.spotify.com/v1/tracks/7rGjz05rxd9M...,https://api.spotify.com/v1/audio-analysis/7rGj...,244893,4
6,Wincent Weiss,1993,spotify:track:3E8qvYDyhKat3qGhmovygJ,3E8qvYDyhKat3qGhmovygJ,0.553,0.315,0,-10.082,1,0.0329,...,0.107,0.116,92.717,audio_features,3E8qvYDyhKat3qGhmovygJ,spotify:track:3E8qvYDyhKat3qGhmovygJ,https://api.spotify.com/v1/tracks/3E8qvYDyhKat...,https://api.spotify.com/v1/audio-analysis/3E8q...,192147,4
7,Mark Forster,Drei Uhr Nachts,spotify:track:58GxgFEG5uHc9oMhofuuIP,58GxgFEG5uHc9oMhofuuIP,0.663,0.673,8,-7.031,1,0.119,...,0.191,0.759,164.817,audio_features,58GxgFEG5uHc9oMhofuuIP,spotify:track:58GxgFEG5uHc9oMhofuuIP,https://api.spotify.com/v1/tracks/58GxgFEG5uHc...,https://api.spotify.com/v1/audio-analysis/58Gx...,161001,4
8,Paloma Faith,Only Love Can Hurt Like This,spotify:track:62ke5zFUJN6RvtXZgVH0F8,62ke5zFUJN6RvtXZgVH0F8,0.566,0.885,8,-4.528,1,0.0818,...,0.334,0.304,90.99,audio_features,62ke5zFUJN6RvtXZgVH0F8,spotify:track:62ke5zFUJN6RvtXZgVH0F8,https://api.spotify.com/v1/tracks/62ke5zFUJN6R...,https://api.spotify.com/v1/audio-analysis/62ke...,232893,4
9,BTS,Epilogue: Young Forever,spotify:track:6BskTMfDjXVzeqQHOau5Vi,6BskTMfDjXVzeqQHOau5Vi,0.606,0.815,5,-3.818,0,0.232,...,0.119,0.386,88.054,audio_features,6BskTMfDjXVzeqQHOau5Vi,spotify:track:6BskTMfDjXVzeqQHOau5Vi,https://api.spotify.com/v1/tracks/6BskTMfDjXVz...,https://api.spotify.com/v1/audio-analysis/6Bsk...,171803,4


In [300]:
# Tag every row with the participant's MBTI type (same value for all rows).
# NOTE(review): the streaming history loaded at the top of this notebook comes
# from a folder named "03_Data_Moni_ISFP", yet the label written here is
# 'ENFP' - confirm which one is correct before this column is used as a label.
result_df['MBTI'] = 'ENFP' # --> change as per MBTI type

In [301]:
# Save the extended DataFrame as CSV in the current working directory, without
# the index column.
# NOTE(review): the file name says "yoojin" while the input folder at the top of
# the notebook is "03_Data_Moni_ISFP" - confirm the name matches the participant.
result_df.to_csv('extended_songs_yoojin_dislike.csv', index=False) # --> change as per Name