In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import boto3
import s3fs

def get_similar_songs(song_ids, playlist_song_ids=None):
    """Recommend the nearest neighbour (by cosine similarity) of each given song.

    The song table is read from a CSV object on S3. Every column except
    ``song_id`` is passed to ``cosine_similarity`` as a feature.

    Args:
        song_ids: Iterable of song ids to find neighbours for. Ids that do not
            appear in the ``song_id`` column are ignored.
        playlist_song_ids: Optional iterable of ids that must not be
            recommended. Defaults to ``song_ids`` so that the input songs are
            never recommended back.

    Returns:
        list: For each matched input song (in table order), the ``song_id`` of
        its most similar *other* song, with excluded ids filtered out.
    """
    song_ids = list(song_ids)
    if not song_ids:
        # Nothing to look up; skip the S3 read entirely.
        return []
    excluded_ids = set(song_ids if playlist_song_ids is None else playlist_song_ids)

    # Read the CSV file from the S3 bucket and close the handle once parsed.
    s3_file = s3fs.S3FileSystem(anon=False)
    with s3_file.open('s3://your-bucket-name/your-file-name.csv') as csv_handle:
        df = pd.read_csv(csv_handle)

    if len(df) < 2:
        # A neighbour requires at least one other song in the table.
        return []

    # Calculate cosine similarity between song features.
    similarity_matrix = cosine_similarity(df.drop('song_id', axis=1))

    # Positional row numbers of the requested songs. The similarity matrix is
    # positional, so index labels must not be used to address it.
    id_column = df['song_id']
    song_positions = id_column.isin(song_ids).to_numpy().nonzero()[0]

    similar_songs = []
    for position in song_positions:
        scores = similarity_matrix[position].copy()
        # Mask the song itself explicitly rather than assuming it sorts last:
        # ties or rounding can place another row at the top.
        scores[position] = float('-inf')
        # Read the id from the column (not the row) to keep its original dtype.
        similar_songs.append(id_column.iloc[scores.argmax()])

    # Drop anything the caller asked to exclude.
    return [song_id for song_id in similar_songs if song_id not in excluded_ids]


# NOTE(review): no `main` is defined in this cell. Unless another cell defines
# it in the kernel first, this call raises NameError. Presumably a leftover
# from a script version of this code; confirm the intended entry point.
if __name__ == '__main__':
    main()



In [None]:
# `from sagemaker.sklearn.processing import ...` does not bind the name
# `sagemaker`, so the package is imported explicitly for get_execution_role().
import sagemaker
from sagemaker.sklearn.processing import SKLearnProcessor

# Processor for scikit-learn scripts: one ml.m5.xlarge instance, running under
# the execution role returned by sagemaker.get_execution_role().
sklearn_processor = SKLearnProcessor(framework_version='0.23-1',
                                     role=sagemaker.get_execution_role(),
                                     instance_type='ml.m5.xlarge',
                                     instance_count=1)


In [None]:
from sagemaker import Model
from sagemaker import Predictor
from sagemaker.sklearn import SKLearnModel

# Create a model from the SKLearnProcessor
# NOTE(review): `sklearn_processor` is built as an SKLearnProcessor elsewhere in
# this notebook; verify that it really exposes `model_data`. Presumably this
# should be the S3 URI of a model.tar.gz artifact instead.
# NOTE(review): this cell imports names *from* sagemaker but not `sagemaker`
# itself; confirm `import sagemaker` has run before this cell.
model = SKLearnModel(model_data=sklearn_processor.model_data,
                     role=sagemaker.get_execution_role(),
                     entry_point='your-python-script.py')

# Deploy the model to create an endpoint
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge')

# Now you can use the `predictor` to get similar songs
# NOTE(review): `song_ids` is not defined in this cell; confirm it is assigned
# before this line runs.
similar_songs = predictor.predict(song_ids)


In [None]:
%%writefile code/preprocess.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import tarfile

def load_data(file_path):
    """
    Read a header-less CSV from ``file_path`` and return it as a DataFrame.

    Because no header row is consumed, columns are labelled 0, 1, 2, ...
    """
    return pd.read_csv(file_path, header=None)

def split_data(data, train_size=0.8, random_state=42):
    """
    Partition ``data`` into train, validation and test subsets.

    ``train_size`` of the rows go to the training set; the remaining rows are
    divided evenly between validation and test. Both splits use the same
    ``random_state``.
    """
    seed = {"random_state": random_state}
    train_data, holdout = train_test_split(data, train_size=train_size, **seed)
    # Halve the hold-out rows: first half validation, second half test.
    validation_data, test_data = train_test_split(holdout, train_size=0.5, **seed)
    return train_data, validation_data, test_data

def save_data(data, file_path):
    """
    Write ``data`` to ``file_path`` as comma-separated text via ``np.savetxt``.
    """
    np.savetxt(fname=file_path, X=data, delimiter=",")

def main():
    """
    Split, scale and save the dataset, and package the fitted scaler.

    Steps:
      1. Load the raw file from ``/opt/ml/processing/input``.
      2. Split it into train / validation / test sets.
      3. Treat the last column as the label and all other columns as features.
      4. Fit a ``StandardScaler`` on the training features only, then apply it
         to the validation and test features.
      5. Archive the fitted scaler as ``model.tar.gz`` under
         ``/opt/ml/processing/scaler_model``.
      6. Write each split as CSV with the label in the first column.
    """
    # Load the data
    data = load_data('/opt/ml/processing/input/data_banknote_authentication.txt')

    # Split the data into training, validation, and test sets
    train_data, validation_data, test_data = split_data(data)

    # Separate the features (all but the last column) from the labels (last column)
    train_features = train_data.iloc[:, :-1]
    train_labels = train_data.iloc[:, -1]
    validation_features = validation_data.iloc[:, :-1]
    validation_labels = validation_data.iloc[:, -1]
    test_features = test_data.iloc[:, :-1]
    test_labels = test_data.iloc[:, -1]

    # Fit the scaler on the training features only, so that validation and test
    # statistics do not influence the scaling parameters.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)
    validation_features = scaler.transform(validation_features)
    test_features = scaler.transform(test_features)

    # Serialize the scaler and archive it in the scaler_model output directory.
    # The path must be absolute: a relative "opt/ml/..." resolves against the
    # working directory and does not reach /opt/ml/processing/scaler_model.
    joblib.dump(scaler, "model.joblib")
    with tarfile.open("/opt/ml/processing/scaler_model/model.tar.gz", "w:gz") as tar_handle:
        tar_handle.add("model.joblib")

    # Recombine with the label as the first column, followed by the scaled features
    train_data = np.column_stack((train_labels, train_features))
    validation_data = np.column_stack((validation_labels, validation_features))
    test_data = np.column_stack((test_labels, test_features))

    # Save the processed data
    save_data(train_data, '/opt/ml/processing/train/train.csv')
    save_data(validation_data, '/opt/ml/processing/validation/validation.csv')
    save_data(test_data, '/opt/ml/processing/test/test.csv')

if __name__ == '__main__':
    main()


In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Version string passed to SKLearnProcessor as framework_version.
sklearn_framework_version = "1.2-1"

# Processor that will run code/preprocess.py. The role, session and instance
# count come from names defined outside this cell.
sklearn_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name="preprocessing-scaling",
    framework_version=sklearn_framework_version,
    instance_count=processing_instance_count,
    instance_type='ml.m5.large',
)

# Explicit S3 destination for the packaged scaler archive.
destination = f's3://{bucket_name}/{s3_prefix}/scaler_model'
print(bucket_name)

processor_args = sklearn_processor.run(
    code="code/preprocess.py",
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        # Only the scaler archive gets an explicit destination.
        ProcessingOutput(
            output_name="scaler_model",
            source="/opt/ml/processing/scaler_model",
            destination=destination,
        ),
        # The three dataset splits are declared without a destination.
        *(
            ProcessingOutput(output_name=channel, source=local_dir)
            for channel, local_dir in (
                ("train", "/opt/ml/processing/train"),
                ("validation", "/opt/ml/processing/validation"),
                ("test", "/opt/ml/processing/test"),
            )
        ),
    ],
)
