# DEDS Assignment 2

Author: Vincent Itucal

In [13]:
import boto3
import uuid
import random
import pandas as pd
from faker import Faker
from boto3.dynamodb.conditions import Key

## Description of use case and justification for the use of a NoSQL database
Entities stored in the NoSQL table:
- `LabelObject` - partition key `pk` holds the label ID (e.g. `LABEL#1234`); sort key `sk` is `METADATA`
- `ArtistObject` - partition key `pk` holds the artist ID (e.g. `ARTIST#1234`); sort key `sk` is `METADATA`
- `SongObject` - partition key `pk` holds the song ID; sort key `sk` is `METADATA`; with attributes:
  - `song_id` - stored in `pk` (e.g. `SONG#1234`)
  - `label_id`
  - `artist_id`
  - `song_title`
  - `label_name`
  - `artist_name`
  - `play_count`
  - `sold_amount_in_php`

Access patterns:
- Get all songs under a label
- Get all songs by artist
- Get top listened songs by artist or label
- Get top selling songs by artist or label

## Python code for adding the items

In [3]:
# Handle to the DynamoDB table that receives every label, artist and song item.
table_name = "nosql_music_stream_service"
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table(table_name)

# Faker instance used to produce the synthetic company names, person names
# and song titles.
fake = Faker()

In [10]:
# Seed both random sources so that re-running this cell (or the whole
# notebook on a fresh kernel) regenerates exactly the same dataset.
SEED = 42
random.seed(SEED)
fake.seed_instance(SEED)

# Dataset size: 3 labels + 17 artists + 80 songs = 100 items in total.
NUM_LABELS = 3
NUM_ARTISTS = 17
NUM_SONGS = 80

# Generate Labels
labels = [
    {"pk": f"LABEL#{i + 1}", "sk": "METADATA", "label_name": fake.company()}
    for i in range(NUM_LABELS)
]
# Index labels by key so each song can find its label without a linear search.
labels_by_id = {label["pk"]: label for label in labels}

# Generate Artists: each artist is signed to one randomly chosen label.
artists = []
for i in range(NUM_ARTISTS):
    label = random.choice(labels)
    artists.append({
        "pk": f"ARTIST#{i + 1}",
        "sk": "METADATA",
        "artist_name": fake.name(),
        "label_id": label["pk"],
    })

# Generate Songs: each song belongs to a random artist and copies that
# artist's (and the artist's label's) id and name onto the song item.
songs = []
for i in range(NUM_SONGS):
    artist = random.choice(artists)
    label = labels_by_id[artist["label_id"]]
    songs.append({
        "pk": f"SONG#{i + 1}",
        "sk": "METADATA",
        "song_title": fake.sentence(nb_words=3),
        "artist_id": artist["pk"],
        "artist_name": artist["artist_name"],
        "label_id": label["pk"],
        "label_name": label["label_name"],
        "play_count": random.randint(1000, 100000),
        "sold_amount_in_php": random.randint(50000, 1000000),
    })

In [11]:
# Write all generated items through one batch writer instead of issuing a
# separate PutItem request per item. The batch writer groups the puts into
# BatchWriteItem calls and automatically resends any unprocessed items.
# NOTE(review): this needs the dynamodb:BatchWriteItem permission - confirm
# the notebook's credentials allow it.
with table.batch_writer() as batch:
    for item in [*labels, *artists, *songs]:
        batch.put_item(Item=item)

## Screenshot of DynamoDB table items. The table should have at least 100 items.

In [15]:
# Read the whole table. A single Scan call returns at most one page of
# results, so keep requesting pages until DynamoDB stops returning a
# LastEvaluatedKey; otherwise the count and the DataFrame could be truncated.
scan_response = table.scan()
scanned_items = list(scan_response['Items'])
last_key = scan_response.get('LastEvaluatedKey')
while last_key is not None:
    next_page = table.scan(ExclusiveStartKey=last_key)
    scanned_items.extend(next_page['Items'])
    last_key = next_page.get('LastEvaluatedKey')

print(f"Item count: {len(scanned_items)}")
all_scan = pd.DataFrame(scanned_items)
display(all_scan)

Item count: 100


Unnamed: 0,artist_id,label_name,play_count,sold_amount_in_php,sk,artist_name,label_id,pk,song_title
0,ARTIST#7,"Guzman, Li and Collins",22352,237767,METADATA,Patrick Nguyen,LABEL#1,SONG#14,Care want develop doctor.
1,,,,,METADATA,Christopher Smith,LABEL#2,ARTIST#4,
2,ARTIST#4,"Edwards, Rodriguez and Ritter",19478,338494,METADATA,Christopher Smith,LABEL#2,SONG#9,Past summer.
3,ARTIST#9,"Guzman, Li and Collins",5988,746647,METADATA,Alicia Daniels,LABEL#1,SONG#63,House well lose.
4,ARTIST#5,Spence Inc,8367,767730,METADATA,Mr. Eric Johnson,LABEL#3,SONG#2,Consumer.
...,...,...,...,...,...,...,...,...,...
95,ARTIST#2,"Edwards, Rodriguez and Ritter",47485,296365,METADATA,Kevin Jones,LABEL#2,SONG#79,Fight approach.
96,ARTIST#15,Spence Inc,47434,61708,METADATA,David Delgado,LABEL#3,SONG#72,Reduce site.
97,ARTIST#11,"Edwards, Rodriguez and Ritter",57193,163448,METADATA,Mrs. Linda Mills,LABEL#2,SONG#45,Provide magazine message drop.
98,ARTIST#6,"Edwards, Rodriguez and Ritter",49417,588606,METADATA,Alexander Townsend,LABEL#2,SONG#29,Son clearly cold.


## Data Lake Setup
Roles:
- Data Engineer
- Data Scientist