In [1]:
import pandas as pd
import seaborn as sns
from math import sqrt
from random import random

In [2]:
data = pd.read_csv("SpotifyData.csv")
data.dropna(inplace=True)
data.drop(columns=["duration", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"], inplace=True)
data

Unnamed: 0,artist,song,year,popularity,danceability,energy
0,Britney Spears,Oops!...I Did It Again,2000,77,0.751,0.834
1,blink-182,All The Small Things,1999,79,0.434,0.897
2,Faith Hill,Breathe,1999,66,0.529,0.496
3,Bon Jovi,It's My Life,2000,78,0.551,0.913
4,*NSYNC,Bye Bye Bye,2000,65,0.614,0.928
...,...,...,...,...,...,...
1995,Jonas Brothers,Sucker,2019,79,0.842,0.734
1996,Taylor Swift,Cruel Summer,2019,78,0.552,0.702
1997,Blanco Brown,The Git Up,2019,69,0.847,0.678
1998,Sam Smith,Dancing With A Stranger (with Normani),2019,75,0.741,0.520


In [3]:
# Normalize Data

data_normalized = data.copy()


normalize = lambda actual, minimum, maximum : (actual - minimum) / (maximum - minimum)

popularity_min, popularity_max = data["popularity"].min(), data["popularity"].max()
danceability_min, danceability_max = data["danceability"].min(), data["danceability"].max()
energy_min, energy_max = data["energy"].min(), data["energy"].max()

for index, row in data_normalized.iterrows():
    data_normalized.at[index, "popularity"] = normalize(row["popularity"], popularity_min, popularity_max)
    data_normalized.at[index, "danceability"] = normalize(row["danceability"], danceability_min, danceability_max)
    data_normalized.at[index, "energy"] = normalize(row["energy"], energy_min, energy_max)

data_normalized

Unnamed: 0,artist,song,year,popularity,danceability,energy
0,Britney Spears,Oops!...I Did It Again,2000,0.865169,0.735225,0.825230
1,blink-182,All The Small Things,1999,0.887640,0.360520,0.891961
2,Faith Hill,Breathe,1999,0.741573,0.472813,0.467217
3,Bon Jovi,It's My Life,2000,0.876404,0.498818,0.908908
4,*NSYNC,Bye Bye Bye,2000,0.730337,0.573286,0.924796
...,...,...,...,...,...,...
1995,Jonas Brothers,Sucker,2019,0.887640,0.842790,0.719309
1996,Taylor Swift,Cruel Summer,2019,0.876404,0.500000,0.685415
1997,Blanco Brown,The Git Up,2019,0.775281,0.848700,0.659994
1998,Sam Smith,Dancing With A Stranger (with Normani),2019,0.842697,0.723404,0.492638


In [4]:
# Split Data
split_index = int(len(data_normalized.index) * 0.75)
training_data, testing_data = data_normalized.iloc[:split_index], data_normalized.iloc[split_index:]

In [5]:
# Multiple Regression Algorithm
# ax + by + cz + d = 0
x, y, z = "energy", "danceability", "popularity"
def get_distance(data, model):
    # Model is a list of possible values [a,b,c,d]
    total_distance = 0

    # Coefficients
    a = model[0]
    b = model[1]
    c = model[2]
    d = model[3]

    for index, row in data.iterrows():
        total_distance += (abs((a * row[x]) + (b * row[y]) + (c * row[z]) + d)) / (sqrt(a * a) + (b * b) + (c * c) + (d * d))

    return total_distance

def random_increment(model): 
    return [ 
        model[0] + (random() - 0.5), 
        model[1] + (random() - 0.5), 
        model[2] + (random() - 0.5), 
        model[3] + (random() - 0.5)
    ]
def next_step(data, model):
    old_distance = get_distance(data, model)
    for i in range(50):
        new_model = random_increment(model)
        new_distance = get_distance(data, model)
        if new_distance < old_distance:
            return new_model
    return model
    


def multiple_regression(data, iterations):
    # Initial Values
    init_model = [0.5, 0.5, 0.5, 0.5]
    print("Initial Values:", init_model)

    init_model_distance = get_distance(data, init_model)

    possible_model = init_model.copy()

    for i in range(iterations):
        possible_model = next_step(data, possible_model)
    

    possible_model_distance = get_distance(data, possible_model)
    print("Best Fit Model Found:", possible_model, "with a distance of", possible_model_distance)

In [6]:
model = multiple_regression(training_data, 10)

Initial Values: [0.5, 0.5, 0.5, 0.5]


KeyboardInterrupt: 