# AWS Spotify Project
December 20, 2023

## Introduction
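This Jupyter notebook serves as a baseline for extracting playlist data from the Spotify API, flattening it into album, artist, and song tables, and uploading the resulting CSV files to Amazon S3.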

## Contact Information
For any inquiries or suggestions, feel free to reach out:
- Author: Tolgahan Cepel
- Email: tolgahan.cepel@gmail.com

In [None]:
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy
import boto3
from datetime import datetime
import json
from io import StringIO

### Retrieve data from Spotify API

In [None]:
# Spotify application credentials (placeholders; supply your own values).
client_id = "<YOUR_CLIENT_ID>"
client_secret = "<YOUR_CLIENT_SECRET>"

# Build the credentials manager and hand it to the Spotify API client.
client_credential_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credential_manager)

# The playlist ID is the last path segment of the playlist URL.
playlist_link = 'https://open.spotify.com/playlist/37i9dQZEVXbIVYVBNw9D5K'
playlist_id = playlist_link.rsplit('/', 1)[-1]

# Fetch the playlist's tracks; the cells below index the response as
# data["items"][i]["track"].
data = sp.playlist_tracks(playlist_id)

## Transformation: JSON to Tabular

### Albums
```
items/
    track/
        album/
            id
            name
            release_date
            total_tracks
            external_urls/
                spotify
```

In [None]:
# Preview the album fields of the first playlist item.
first_album = data["items"][0]["track"]["album"]
print(first_album["id"])
print(first_album["name"])
print(first_album["release_date"])
print(first_album["total_tracks"])
print(first_album["external_urls"]["spotify"])

In [None]:
def extract_albums(json_data):
    """Flatten the album of every playlist item into a list of dicts.

    Args:
        json_data: Playlist payload providing ``json_data["items"]``, where
            each item exposes ``["track"]["album"]`` with the keys ``id``,
            ``name``, ``release_date``, ``total_tracks`` and
            ``external_urls["spotify"]``.

    Returns:
        A list with one dict per item (duplicates are not removed), keyed by
        ``album_id``, ``album_name``, ``release_date``, ``total_tracks`` and
        ``spotify_url``.

    Raises:
        KeyError: If an item lacks one of the keys listed above.
    """
    albums_list = []
    # Iterate the argument, not the module-level ``data``: the function must
    # process whatever payload the caller passes in.
    for row in json_data["items"]:
        album = row["track"]["album"]
        albums_list.append(
            {
                "album_id": album["id"],
                "album_name": album["name"],
                "release_date": album["release_date"],
                "total_tracks": album["total_tracks"],
                "spotify_url": album["external_urls"]["spotify"],
            }
        )
    return albums_list

In [None]:
# extract_albums(data)

### Artists
```
items/
    track/
        album/
            artists/
                []/
                    id
                    name
                    type
                    external_urls/
                        spotify
```

In [None]:
# Preview the fields of the first album artist of the first playlist item.
first_artist = data["items"][0]["track"]["album"]["artists"][0]
print(first_artist["id"])
print(first_artist["name"])
print(first_artist["type"])
print(first_artist["external_urls"]["spotify"])

In [None]:
def extract_artists(json_data):
    """Flatten every album artist of every playlist item into a list of dicts.

    Args:
        json_data: Playlist payload providing ``json_data["items"]``, where
            each item exposes ``["track"]["album"]["artists"]`` as an iterable
            of artists with the keys ``id``, ``name``, ``type`` and
            ``external_urls["spotify"]``.

    Returns:
        A list with one dict per (item, artist) pair (duplicates are not
        removed), keyed by ``artist_id``, ``artist_name``, ``artist_type`` and
        ``spotify_url``. Items without artists contribute nothing.

    Raises:
        KeyError: If an item or artist lacks one of the keys listed above.
    """
    artists_list = []
    # Iterate the argument, not the module-level ``data``.
    for row in json_data["items"]:
        for artist in row["track"]["album"]["artists"]:
            # Append inside the inner loop so every artist is kept, not only
            # the last one of each album.
            artists_list.append(
                {
                    "artist_id": artist["id"],
                    "artist_name": artist["name"],
                    "artist_type": artist["type"],
                    "spotify_url": artist["external_urls"]["spotify"],
                }
            )
    return artists_list

In [None]:
# extract_artists(data)

### Songs
```
items/
    added_at
    track/
        id
        album/
            id
        artists/
            []/
                id
        name
        duration_ms
        popularity
        external_urls/
            spotify
```

In [None]:
# Preview the song fields of the first playlist item.
first_item = data["items"][0]
first_track = first_item["track"]
print(first_track["id"])
print(first_track["album"]["id"])
print(first_track["album"]["artists"][0]["id"])
print(first_track["name"])
print(first_track["duration_ms"])
print(first_item["added_at"])
print(first_track["popularity"])
print(first_track["external_urls"]["spotify"])

In [None]:
def extract_songs(json_data):
    """Flatten the track of every playlist item into a list of dicts.

    Args:
        json_data: Playlist payload providing ``json_data["items"]``, where
            each item exposes ``added_at`` and a ``track`` with the keys
            ``id``, ``album["id"]``, ``name``, ``duration_ms``, ``popularity``
            and ``external_urls["spotify"]``.

    Returns:
        A list with one dict per item (duplicates are not removed), keyed by
        ``song_id``, ``album_id``, ``song_name``, ``duration_ms``,
        ``added_at``, ``popularity`` and ``spotify_url``.

    Raises:
        KeyError: If an item lacks one of the keys listed above.
    """
    songs_list = []
    # Iterate the argument, not the module-level ``data``.
    for row in json_data["items"]:
        track = row["track"]
        songs_list.append(
            {
                "song_id": track["id"],
                "album_id": track["album"]["id"],
                "song_name": track["name"],
                "duration_ms": track["duration_ms"],
                "added_at": row["added_at"],
                "popularity": track["popularity"],
                "spotify_url": track["external_urls"]["spotify"],
            }
        )
    return songs_list
    

## Transform

In [None]:
import os

# SECURITY: credentials must never be hard-coded in the notebook. The key pair
# that used to be committed in this cell has to be treated as compromised and
# rotated in IAM.
#
# Credentials are read from the standard AWS environment variables. When a
# variable is unset the value is None, and boto3 then falls back to its default
# credential chain (shared credentials/config files, instance or role
# credentials, ...).
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_region = 'eu-central-1'  # Region used for the session, e.g. 'us-west-2'

# Create a Boto3 session for the configured region
session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=aws_region
)


In [None]:
# Bucket and key prefix under which the raw JSON files are looked up.
Bucket = "spotify-etl-pipeline-cepel"
Key = "raw_data/"

# S3 client bound to the boto3 session.
client = session.client("s3")

In [None]:
# Parsed JSON payloads and the S3 keys they were read from (parallel lists).
spotify_data = []
spotify_keys = []

# Paginate the listing so prefixes holding more objects than fit in a single
# response are read completely. A page without "Contents" (no object matches
# the prefix) simply contributes nothing instead of raising KeyError.
paginator = client.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=Bucket, Prefix=Key):
    for file in page.get("Contents", []):
        file_key = file["Key"]

        # Only objects with a .json extension are loaded; anything else under
        # the prefix (e.g. the "folder" placeholder key) is skipped.
        if not file_key.endswith(".json"):
            continue

        response = client.get_object(Bucket=Bucket, Key=file_key)
        jsonObject = json.loads(response["Body"].read())

        spotify_data.append(jsonObject)
        spotify_keys.append(file_key)

In [None]:
# Collect the rows of every raw file first, so the dataframes built below cover
# all files instead of only the last one processed.
album_list = []
artist_list = []
song_list = []

# NOTE: the loop variable is deliberately named `data`. It rebinds the
# module-level `data`, which matters if the extract_* helpers read that global
# rather than their argument.
for data in spotify_data:
    album_list.extend(extract_albums(data))
    artist_list.extend(extract_artists(data))
    song_list.extend(extract_songs(data))

# Converting each list into a dataframe. Columns are given explicitly (they
# mirror the keys produced by the extract_* helpers) so that the frames keep
# their schema even when no rows were collected.
album_df = pd.DataFrame(
    album_list,
    columns=["album_id", "album_name", "release_date", "total_tracks", "spotify_url"],
)
artist_df = pd.DataFrame(
    artist_list,
    columns=["artist_id", "artist_name", "artist_type", "spotify_url"],
)
song_df = pd.DataFrame(
    song_list,
    columns=[
        "song_id",
        "album_id",
        "song_name",
        "duration_ms",
        "added_at",
        "popularity",
        "spotify_url",
    ],
)

# Dropping possible duplicates
album_df = album_df.drop_duplicates(subset=['album_id'])
artist_df = artist_df.drop_duplicates(subset=['artist_id'])
song_df = song_df.drop_duplicates(subset=['song_id'])

# NOTE(review): format='%Y-%m-%d' raises if a release_date is not a full date;
# confirm the source never delivers year-only or month-only values.
album_df["album_release_date"] = pd.to_datetime(album_df["release_date"], format='%Y-%m-%d')
# Parse the song's own added_at timestamp (previously this column was filled
# from the album release dates by mistake). No fixed format is passed because
# added_at is a full timestamp, not a plain date — TODO confirm exact format.
song_df['song_added_at'] = pd.to_datetime(song_df["added_at"])

In [None]:
# Serialize each dataframe to CSV text held in memory
# Album: the object key carries today's date (YYYYMMDD)
album_key = f"album_data/album_transformed_{datetime.now():%Y%m%d}.csv"
album_buffer = StringIO()
album_df.to_csv(album_buffer, index=False)
album_content = album_buffer.getvalue()

In [None]:
# Artist: CSV text in memory, object key carries today's date (YYYYMMDD)
artist_key = f"artist_data/artist_transformed_{datetime.now():%Y%m%d}.csv"
artist_buffer = StringIO()
artist_df.to_csv(artist_buffer, index=False)
artist_content = artist_buffer.getvalue()

# Song: same procedure as for the artists
song_key = f"song_data/song_transformed_{datetime.now():%Y%m%d}.csv"
song_buffer = StringIO()
song_df.to_csv(song_buffer, index=False)
song_content = song_buffer.getvalue()


In [None]:
# Upload the three CSV documents under the 'processing/' prefix.
# The *_content values are already CSV text, so they are uploaded as UTF-8
# bytes unchanged. Passing them through json.dumps would wrap the text in
# quotes and escape every newline, storing a JSON string literal instead of a
# readable CSV file.
for csv_key, csv_content in (
    (album_key, album_content),
    (artist_key, artist_content),
    (song_key, song_content),
):
    client.put_object(
        Bucket='spotify-etl-pipeline-cepel',
        Key='processing/' + csv_key,
        Body=csv_content.encode("utf-8"),
    )