In [9]:
import logging
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.functions import col, desc, rand
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans
from difflib import SequenceMatcher
import requests
import pandas as pd

In [4]:
# CSV file that RecommendationEngine reads with pandas when it loads its data.
data_path = '2023030823_reddit.csv'

In [12]:
# Emit INFO-and-above records from the root logger, and create the module
# logger that RecommendationEngine uses for its progress messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class RecommendationEngine:
    """Song recommender that clusters tracks by audio features with K-Means.

    Construction is eager: it reads the CSV at the module-level ``data_path``,
    tags every track with a ``major_genre``, standardizes five audio features
    and fits a 5-cluster K-Means model. ``get_recommendation`` then maps a
    feature vector to its cluster and samples popular tracks from it.

    Attributes set by ``__init__``:
        spark: the SparkSession passed in.
        spotify_data: loaded tracks, including the ``major_genre`` column.
        standardize_data: ``spotify_data`` without null rows, plus the
            ``features`` and ``standardized`` vector columns.
        standardscaler: the fitted StandardScaler model.
        assemble: the VectorAssembler that builds ``features``.
        model: the fitted K-Means model.
        final_df: ``standardize_data`` plus the ``prediction`` column (cached).
    """

    # Columns assembled (in this order) into the feature vector for K-Means.
    FEATURE_COLUMNS = ('danceability', 'energy', 'loudness', 'instrumentalness', 'valence')

    def __init__(self, spark):
        """Load the data and train the model using the given SparkSession."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark = spark
        self.spotify_data = self.__load_data_from_datalake()
        self.standardize_data, self.standardscaler, self.assemble = self.__vectorize_standardize_data()
        self.model, self.final_df = self.__train_model()

    @staticmethod
    def _pick_major_genre(text):
        """Return the coarse genre that best matches a ``', '``-separated genre string.

        Each entry of ``text`` has its spaces removed and is compared with
        every known genre via ``SequenceMatcher.ratio()``; the ratios are
        summed per genre and the genre with the highest total wins.

        Args:
            text: genre string such as ``'pop, dance pop'``, or ``None``.

        Returns:
            The winning genre name, or ``None`` when ``text`` is ``None``.
            When several genres tie (e.g. for an empty string, where every
            score is 0) the one listed first below is returned.
        """
        if text is None:
            return None
        scores = dict.fromkeys(
            ('country', 'electronic', 'funk', 'hip hop', 'jazz', 'rap', 'classical',
             'dance', 'soul', 'indie', 'latin', 'pop', 'punk', 'reggae', 'rock',
             'metal', 'r&b', 'house', 'techno', 'folk'),
            0,
        )
        for token in text.split(', '):
            token = token.replace(' ', '')
            for genre in scores:
                scores[genre] += SequenceMatcher(None, genre, token).ratio()
        # max() returns the first maximal key, so ties go to the earliest genre.
        return max(scores, key=scores.get)

    def __load_data_from_datalake(self):
        """Read the CSV at ``data_path`` into Spark and add ``major_genre``.

        Despite the name, the data currently comes from a local CSV read with
        pandas, not from a data lake. Columns not needed downstream are
        dropped; the remaining column order is: artists_id, artists_name,
        track_id, track_name, release_date, genres, track_popularity,
        danceability, energy, loudness, instrumentalness, valence,
        duration_ms, major_genre.

        Returns:
            A Spark DataFrame with one row per CSV record.
        """
        logger.info("Loading Spotify data...")
        df = pd.read_csv(data_path)

        # (column name, Spark type) in CSV column order; every field is nullable.
        column_types = [
            ("post_id", StringType),
            ("post_created_utc", StringType),
            ("post_title", StringType),
            ("post_url", StringType),
            ("artists_id", StringType),
            ("artists_name", StringType),
            ("track_id", StringType),
            ("track_name", StringType),
            ("release_date", StringType),
            ("genres", StringType),
            ("track_popularity", IntegerType),
            ("danceability", FloatType),
            ("energy", FloatType),
            ("key", IntegerType),
            ("mode", StringType),
            ("loudness", FloatType),
            ("speechiness", FloatType),
            ("acousticness", FloatType),
            ("instrumentalness", FloatType),
            ("liveness", FloatType),
            ("valence", FloatType),
            ("tempo", FloatType),
            ("duration_ms", IntegerType),
            ("time_signature", IntegerType),
        ]
        schema = StructType([StructField(name, dtype(), True) for name, dtype in column_types])

        unused_columns = ('post_id', 'post_created_utc', 'post_title', 'post_url',
                          'key', 'mode', 'speechiness', 'acousticness', 'liveness',
                          'tempo', 'time_signature')
        spotify_data = self.spark.createDataFrame(df, schema = schema).drop(*unused_columns)

        get_genres_udf = functions.udf(self._pick_major_genre, StringType())
        return spotify_data.withColumn("major_genre", get_genres_udf(spotify_data["genres"]))

    def __vectorize_standardize_data(self):
        """Assemble and standardize the audio features of ``self.spotify_data``.

        Rows containing any null are dropped before the feature vector is
        built.

        Returns:
            ``(standardize_data, standardscaler, assemble)``: the transformed
            DataFrame, the fitted scaler model and the VectorAssembler, so
            the same transformations can be applied to user input later.
        """
        spotify_data = self.spotify_data.na.drop()
        assemble = VectorAssembler(inputCols = list(self.FEATURE_COLUMNS), outputCol='features')
        assembled_data = assemble.transform(spotify_data)

        scale = StandardScaler(inputCol = 'features', outputCol = 'standardized')
        standardscaler = scale.fit(assembled_data)
        standardize_data = standardscaler.transform(assembled_data)
        return standardize_data, standardscaler, assemble

    def __train_model(self):
        """Fit K-Means (k=5, seed=3) on the standardized features.

        Returns:
            ``(model, final_df)`` where ``final_df`` is the training data with
            a ``prediction`` (cluster id) column.
        """
        clusters = 5
        seed = 3
        logger.info("Training K-Means model...")

        KMeans_algo = KMeans(featuresCol = 'standardized', k = clusters, seed = seed)
        model = KMeans_algo.fit(self.standardize_data)
        # final_df is queried on every recommendation request; caching keeps
        # Spark from re-running the whole lazy pipeline (CSV conversion,
        # Python genre UDF, assembling, scaling, prediction) each time.
        final_df = model.transform(self.standardize_data).cache()

        logger.info("K-Means model built!")
        return model, final_df

    def get_recommendation(self, danceability = 0.5, energy = 0.5, loudness = -10.1, instrumentalness = 0.5, valence = 0.5):
        """Recommend tracks from the cluster closest to the given features.

        The input is assembled and standardized with the fitted transformers
        and assigned to a cluster. From that cluster the 10 most popular
        tracks are taken, 5 of them are picked at random, and those are
        returned ordered by descending popularity.

        Args:
            danceability, energy, loudness, instrumentalness, valence:
                numeric audio-feature values describing the wanted track.

        Returns:
            A list of at most 5 Spark ``Row`` objects from ``final_df``.

        Raises:
            ValueError: if any feature value is ``None``.
        """
        values = (danceability, energy, loudness, instrumentalness, valence)
        missing = [name for name, value in zip(self.FEATURE_COLUMNS, values) if value is None]
        if missing:
            # Fail early with a readable message instead of a Spark error
            # raised later when the assembler meets a null.
            raise ValueError("Missing feature value(s): " + ", ".join(missing))

        # FloatType columns accept only Python floats, so coerce ints up front.
        input_ = [tuple(float(value) for value in values)]
        schema = StructType([StructField(name, FloatType(), True) for name in self.FEATURE_COLUMNS])
        input_df = self.spark.createDataFrame(data = input_, schema = schema)

        assembled_input_data = self.assemble.transform(input_df)
        standardize_input_data = self.standardscaler.transform(assembled_input_data)
        predicted_cluster = self.model.transform(standardize_input_data).select('prediction').collect()[0][0]

        similar_cluster_songs = self.final_df.filter(col('prediction') == predicted_cluster).orderBy(desc('track_popularity'))
        recommendation = similar_cluster_songs.limit(10).orderBy(rand()).limit(5).orderBy(desc('track_popularity')).collect()
        return recommendation

In [6]:
# Reuse the active Spark session if one exists, otherwise start one named "test".
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

In [13]:
# Build the engine once; its constructor loads the data and trains the model.
# The Flask view below uses this instance for every request.
engine = RecommendationEngine(spark)

INFO:__main__:Starting up the Recommendation Engine: 
INFO:__main__:Loading Spotify data...
  for column, series in pdf.iteritems():
INFO:__main__:Training K-Means model...
INFO:__main__:K-Means model built!


In [14]:
def get_spotify_bearer_token():
    """Request a Spotify access token and return it as a request header.

    Posts a ``client_credentials`` grant to the Spotify accounts service.
    The client id and secret are taken from the ``SPOTIFY_CLIENT_ID`` and
    ``SPOTIFY_CLIENT_SECRET`` environment variables when those are set.

    Returns:
        dict: ``{"Authorization": "Bearer <access_token>"}``.

    Raises:
        Exception: with ``(status_code, response_text)`` when the service
            answers with anything other than HTTP 200.
        requests.exceptions.RequestException: on connection failure or when
            the service does not answer within the timeout.
    """
    import os  # local import so this block does not depend on the file header

    auth_url = 'https://accounts.spotify.com/api/token'
    # SECURITY(review): these credentials are committed in source. They stay
    # only as a fallback so existing runs keep working; rotate them and
    # supply the real values through the environment variables instead.
    client_id = os.environ.get('SPOTIFY_CLIENT_ID', '9e2609ac010148adaea6f6bf8c75ba66')
    client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET', 'c4a50051e7c24edf84ad5a865c162f22')

    payload = {"client_id": client_id, "client_secret": client_secret, "grant_type": "client_credentials"}
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.post(auth_url, data = payload, timeout = 10)
    if response.status_code != 200:
        print("Error Response Code: " + str(response.status_code))
        raise Exception(response.status_code, response.text)
    access_token = response.json()["access_token"]
    return {"Authorization": "Bearer " + access_token}
# Fetch the Authorization header once, when this cell runs; the Flask view
# sends it with its Spotify API requests.
header = get_spotify_bearer_token()

In [15]:
import flask
import requests
import random
# Choices passed to the index.html template, one list per audio feature.
# The translate_* functions below map each label to a numeric value.
danceability_option = [
    'Perfect For Dancing',
    'Rhythmic',
    'Not Danceable',
]
energy_option = [
    'Extremely Exciting',
    'Quite Exciting',
    'Calm',
]
instrumentalness_option = [
    'Yes',
    'No',
]
loudness_option = [
    'Very Loud',
    'Average Loudness',
    'Soft',
]
valence_option = [
    'Highly Positive',
    'Positive',
    'Balanced',
    'Negative',
    'Highly Negative',
]

def translate_danceability(danceability):
    """Turn a danceability label into a numeric value.

    Each known label yields a random float from its band; an empty string
    yields the neutral value 0.5; any other input yields ``None``.
    """
    bands = (
        ('Not Danceable', 0, 0.3),
        ('Rhythmic', 0.3, 0.6),
        ('Perfect For Dancing', 0.6, 1),
    )
    for label, low, high in bands:
        if danceability == label:
            return random.uniform(low, high)
    if danceability == '':
        return 0.5
    return None
    
def translate_energy(energy):
    """Turn an energy label into a numeric value.

    Known labels give a random float inside their band, ``''`` gives 0.5,
    and anything else gives ``None``.
    """
    label_bounds = [
        ('Calm', (0, 0.3)),
        ('Quite Exciting', (0.3, 0.6)),
        ('Extremely Exciting', (0.6, 1)),
    ]
    bounds = next((rng for label, rng in label_bounds if energy == label), None)
    if bounds is not None:
        return random.uniform(*bounds)
    return 0.5 if energy == '' else None

def translate_loudness(loudness):
    """Turn a loudness label into a loudness value in decibels.

    Spotify reports loudness in dB, where values closer to 0 are louder, so
    the loudest label maps to the band nearest 0.

    Args:
        loudness: 'Very Loud', 'Average Loudness', 'Soft', or '' when the
            user made no choice.

    Returns:
        A random float from the label's band: 'Very Loud' -> [-5, 0],
        'Average Loudness' -> [-10, -5], 'Soft' -> [-20, -10]. An empty
        string returns -10.1, a mid-range loudness (0.5 dB would lie outside
        every band above). Any other input returns ``None``.
    """
    if loudness == 'Very Loud':
        return random.uniform(-5, 0)
    elif loudness == 'Average Loudness':
        return random.uniform(-10, -5)
    elif loudness == 'Soft':
        return random.uniform(-20, -10)
    elif loudness == '':
        return -10.1
    return None
    
def translate_instrumentalness(instrumentalness):
    """Turn a Yes/No instrumental choice into a numeric instrumentalness.

    Args:
        instrumentalness: 'Yes', 'No', or '' when the user made no choice.

    Returns:
        A random float in [0.5, 1] for 'Yes', in [0, 0.5] for 'No', 0.5 for
        an empty string, and ``None`` for any other input.
    """
    if instrumentalness == 'No':
        return random.uniform(0, 0.5)
    elif instrumentalness == 'Yes':
        return random.uniform(0.5, 1)
    # Compare the function's own argument here: the previous check used an
    # undefined name and raised NameError for every non Yes/No input.
    elif instrumentalness == '':
        return 0.5
    return None

def translate_valence(valence):
    """Turn a valence (mood) label into a numeric value.

    The five labels split [0, 1] into equal bands from 'Highly Negative' up
    to 'Highly Positive'; a known label returns a random float from its
    band, ``''`` returns 0.5 and any other input returns ``None``.
    """
    bands = (
        ('Highly Negative', 0, 0.2),
        ('Negative', 0.2, 0.4),
        ('Balanced', 0.4, 0.6),
        ('Positive', 0.6, 0.8),
        ('Highly Positive', 0.8, 1),
    )
    for label, low, high in bands:
        if valence == label:
            return random.uniform(low, high)
    if valence == '':
        return 0.5
    return None

app = flask.Flask(__name__, template_folder = ('template'))


def _fetch_cover_url(track_id):
    """Return the URL of the first album image Spotify lists for a track.

    Uses the module-level ``header``. If Spotify answers 401 (the token
    obtained at startup is no longer accepted), a new token is requested and
    the call is retried once.

    Returns:
        The image URL, or ``None`` when Spotify returns no track or no image.

    Raises:
        requests.exceptions.RequestException: on network failure, timeout, or
            an error status that persists after the retry.
    """
    global header
    url = "https://api.spotify.com/v1/tracks/"
    query = {'ids': track_id}
    response = requests.get(url, headers = header, params = query, timeout = 10)
    if response.status_code == 401:
        header = get_spotify_bearer_token()
        response = requests.get(url, headers = header, params = query, timeout = 10)
    # Surface API failures as an HTTP error rather than a KeyError below.
    response.raise_for_status()

    tracks = response.json().get('tracks') or []
    if not tracks or not tracks[0]:
        return None
    images = tracks[0].get('album', {}).get('images') or []
    return images[0]['url'] if images else None


@app.route('/', methods = ['GET', 'POST'])
def main():
    """Serve the preference form and, on POST, the recommendation page.

    Non-POST requests (GET, plus the HEAD that Flask routes here because GET
    is allowed) render ``index.html`` with the option lists. A POST
    translates the submitted labels into numeric features, asks ``engine``
    for recommendations and renders ``result.html`` with one list per track:
    ``[track_name, release_date, genre, duration_seconds, popularity,
    cover_image_url]``.
    """
    if flask.request.method != 'POST':
        # Rendering for every non-POST method means a HEAD request no longer
        # falls through and returns None, which Flask rejects with a 500.
        return flask.render_template('index.html', danceability_option = danceability_option, energy_option = energy_option, \
                                     loudness_option = loudness_option, instrumentalness_option = instrumentalness_option, \
                                     valence_option = valence_option)

    form = flask.request.form
    danceability = translate_danceability(form['danceability_option'])
    energy = translate_energy(form['energy_option'])
    loudness = translate_loudness(form['loudness_option'])
    instrumentalness = translate_instrumentalness(form['instrumentalness_option'])
    valence = translate_valence(form['valence_option'])
    results = engine.get_recommendation(danceability, energy, loudness, instrumentalness, valence)

    recommendations = []
    for result in results:
        # Rows are read by column name so the view does not depend on the
        # exact column order of the engine's DataFrame.
        recommendations.append([
            result['track_name'],
            result['release_date'],
            result['major_genre'],
            int(result['duration_ms']) / 1000,  # milliseconds -> seconds
            result['track_popularity'],
            _fetch_cover_url(result['track_id']),
        ])
    return flask.render_template('result.html', recommendations = recommendations)

# Start Flask's built-in server when this code runs as the main program.
# NOTE(review): host "0.0.0.0" listens on every network interface — confirm
# that this exposure is intended outside local development.
if __name__ == '__main__':
    app.run(host = "0.0.0.0")

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.23.0.3:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:172.23.0.1 - - [12/May/2023 03:00:30] "GET / HTTP/1.1" 200 -
INFO:werkzeug:172.23.0.1 - - [12/May/2023 03:00:38] "POST / HTTP/1.1" 200 -
INFO:py4j.clientserver:Closing down clientserver connection
INFO:werkzeug:172.23.0.1 - - [12/May/2023 03:00:38] "GET / HTTP/1.1" 200 -
INFO:werkzeug:172.23.0.1 - - [12/May/2023 03:00:45] "POST / HTTP/1.1" 200 -
INFO:py4j.clientserver:Closing down clientserver connection
INFO:werkzeug:172.23.0.1 - - [12/May/2023 03:00:45] "GET / HTTP/1.1" 200 -
INFO:werkzeug:172.23.0.1 - - [12/May/2023 03:00:55] "POST / HTTP/1.1" 200 -
INFO:py4j.clientserver:Closing down clientserver connection
INFO:werkzeug:172.23.0.1 - - [12/May/2023 03:00:55] "GET / HTTP/1.1" 200 -
