In [2]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Feature column names followed by the 'class' label column.
# NOTE(review): this list is not referenced by the code visible in this
# notebook chunk -- confirm whether a later cell relies on it.
columns_of_interest = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
    'class',
]

# Load the two pre-processed CSV exports.
df = pd.read_csv('processed_liked_songs.csv')
unlikedSongs = pd.read_csv('unlikedProcessedSongs.csv')

# Stack both frames row-wise into a single data set; each frame keeps its
# original index labels, so labels may repeat in the combined frame.
frames = [df, unlikedSongs]
totalData = pd.concat(frames)


In [4]:

class DecisionNode:
    """A single node of the decision tree.

    Internal nodes carry the name of the ``feature`` they split on together
    with their ``left`` / ``right`` children; leaf nodes carry the predicted
    label in ``value`` and have no children.
    """

    def __init__(
        self,
        feature: "str | None" = None,
        value: "object | None" = None,
        left: "DecisionNode | None" = None,
        right: "DecisionNode | None" = None,
    ) -> None:
        # Children are stored first, then the payload; all four attributes
        # default to None so a node can be filled in incrementally.
        self.left, self.right = left, right
        self.feature, self.value = feature, value


class DecisionTree:
    """Binary decision tree over features encoded as 'InRange' / 'OutOfRange'.

    Every feature column is expected to hold the strings ``'InRange'`` or
    ``'OutOfRange'``; the target label is read from the ``'class'`` column.
    Splits are chosen by information gain (entropy reduction).
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        """Configure the tree.

        Args:
            max_depth: Maximum depth of the tree; ``None`` means unlimited.
            min_samples_split: A node holding fewer rows than this becomes
                a leaf.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.availableAttributes = {'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'}
        self.root = None

    def should_stop_splitting(self, depth, data):
        """Return True when the depth limit is reached or ``data`` is too small."""
        # ``>=`` instead of ``==`` so a caller starting below/above the limit
        # (or a non-integer limit) can never step past it.
        depth_reached = self.max_depth is not None and depth >= self.max_depth
        return depth_reached or len(data) < self.min_samples_split

    def split_data(self, data, split_column):
        """Partition ``data`` on ``split_column``.

        Returns:
            ``(in_range_rows, out_of_range_rows)``. Rows whose value is
            neither ``'InRange'`` nor ``'OutOfRange'`` land in neither part.
        """
        split_column_values = data[split_column]
        return data[split_column_values == 'InRange'], data[split_column_values == 'OutOfRange']

    def find_best_split(self, data, availableAttributes):
        """Pick the attribute with the highest information gain.

        Attributes whose split leaves one side empty are skipped: such a
        split separates nothing and would only create an empty child.

        Returns:
            The attribute name, or ``None`` when no attribute yields a split
            with rows on both sides (including when none are available).
        """
        bestFeature = None
        bestInformationGain = float('-inf')
        # Sorted iteration makes tie-breaking deterministic; iterating a set
        # of strings directly can change order between interpreter runs.
        for attribute in sorted(availableAttributes):
            split1, split2 = self.split_data(data, attribute)
            if split1.empty or split2.empty:
                continue

            currentInformationGain = self.informationGain(split1, split2, data)
            if currentInformationGain > bestInformationGain:
                bestInformationGain = currentInformationGain
                bestFeature = attribute

        return bestFeature

    def informationGain(self, split1, split2, data):
        """Return entropy(data) minus the size-weighted entropy of the splits.

        Returns ``0.0`` when both splits are empty, since a split that
        receives no rows carries no information.
        """
        totalInstances = len(split1) + len(split2)
        if totalInstances == 0:
            return 0.0

        parentEntropy = self.calcEntropy(data)
        childEntropy = (
            (len(split1) / totalInstances) * self.calcEntropy(split1)
            + (len(split2) / totalInstances) * self.calcEntropy(split2)
        )
        return parentEntropy - childEntropy

    def calcEntropy(self, subset):
        """Return the Shannon entropy (base 2) of ``subset['class']``.

        An empty subset has entropy ``0.0``.
        """
        counts = subset['class'].value_counts()
        # value_counts() drops missing labels, so normalise by the number of
        # counted labels rather than len(subset) to keep probabilities
        # summing to one.
        total = counts.sum()

        entropy = 0.0
        for count in counts:
            if count == 0:
                # Categorical columns report unused categories with a zero
                # count; they contribute nothing and log2(0) is undefined.
                continue
            probability = count / total
            entropy -= probability * math.log2(probability)

        return entropy

    # NOTE(review): unimplemented placeholder. It is declared without
    # ``self``, so when called on an instance the instance is bound to
    # ``data``. Signature left untouched to avoid changing the interface.
    def getPurity(data, node):
        pass

    def build_tree(self, data, availableAttributes, depth=0):
        """Recursively build the subtree for ``data``.

        Args:
            data: Rows reaching this node (must contain a ``'class'`` column).
            availableAttributes: Attribute names still usable for splitting.
            depth: Depth of the node being built (root is 0).

        Returns:
            A ``DecisionNode``: a leaf holding the majority class when a
            stopping rule applies, the node is pure, or no useful split
            exists; otherwise an internal node with both children built.
        """
        if self.should_stop_splitting(depth, data) or data['class'].nunique() <= 1:
            return DecisionNode(value=self.get_majority_class(data))

        best_feature = self.find_best_split(data, availableAttributes)
        # Must be checked *before* splitting: indexing the frame with a
        # ``None`` column name raises KeyError.
        if best_feature is None:
            return DecisionNode(value=self.get_majority_class(data))

        left_subset, right_subset = self.split_data(data, best_feature)
        # A feature is used at most once along any root-to-leaf path.
        remaining_attributes = set(availableAttributes) - {best_feature}

        node = DecisionNode(feature=best_feature)
        node.left = self.build_tree(left_subset, remaining_attributes, depth + 1)
        node.right = self.build_tree(right_subset, remaining_attributes, depth + 1)
        return node

    def fit(self, data):
        """Build the tree from ``data`` and store it in ``self.root``."""
        self.root = self.build_tree(data, self.availableAttributes)

    def get_majority_class(self, data):
        """Return the most frequent label in ``data['class']``.

        Returns the placeholder ``'DefaultMajorityClass'`` when ``data``
        contains no labels.
        """
        class_counts = data['class'].value_counts()
        if class_counts.empty:
            return 'DefaultMajorityClass'
        return class_counts.idxmax()

    def predict(self, input_data):
        """Predict the label for one sample.

        Args:
            input_data: Mapping-like object exposing ``.get`` (e.g. a dict or
                a pandas Series) from feature name to 'InRange'/'OutOfRange'.

        Returns:
            The ``value`` of the leaf reached. Any feature value other than
            ``'InRange'`` (including a missing key) follows the right branch.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.root is None:
            raise RuntimeError("DecisionTree.predict called before fit")

        current_node = self.root
        while current_node.left or current_node.right:
            feature_value = input_data.get(current_node.feature)
            if feature_value == 'InRange':
                current_node = current_node.left
            else:
                current_node = current_node.right

        return current_node.value

    def calculateAccuracy(self, predictedResults, actualResults):
        """Return the fraction of predictions equal to the actual labels."""
        # accuracy_score expects (y_true, y_pred); accuracy is symmetric, so
        # the result is the same as with the arguments swapped.
        return accuracy_score(actualResults, predictedResults)


# Train the tree on 80% of the combined data and evaluate on the other 20%.
decision_tree = DecisionTree(max_depth=30, min_samples_split=100)

# Fixed seeds keep the shuffle and the train/test partition reproducible.
shuffled_df = totalData.sample(frac=1, random_state=42)
train_set, test_set = train_test_split(shuffled_df, test_size=0.2, random_state=42)

# Keep the true labels aside and strip them from the rows used for prediction.
actualResults = test_set['class']
test_set_without_class = test_set.drop('class', axis=1)

decision_tree.fit(train_set)

results = []
for _, row in test_set_without_class.iterrows():
    # Predict once per row and reuse the value for both the log line and
    # the collected results (previously the tree was walked twice per row).
    prediction = decision_tree.predict(row)
    print(prediction)
    results.append(prediction)

# Arguments follow calculateAccuracy's (predictedResults, actualResults)
# parameter order; the score is symmetric, so the value is unchanged.
accuracy = decision_tree.calculateAccuracy(results, actualResults)

print(f"Accuracy: {accuracy * 100:.2f}%")

OutOfRange
OutOfRange
OutOfRange
InRange
InRange
InRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
InRange
OutOfRange
OutOfRange
InRange
OutOfRange
OutOfRange
InRange
OutOfRange
OutOfRange
InRange
OutOfRange
InRange
OutOfRange
OutOfRange
OutOfRange
InRange
InRange
InRange
InRange
OutOfRange
OutOfRange
OutOfRange
InRange
OutOfRange
OutOfRange
InRange
InRange
OutOfRange
InRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
InRange
OutOfRange
InRange
InRange
InRange
InRange
OutOfRange
OutOfRange
OutOfRange
InRange
InRange
InRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
InRange
InRange
OutOfRange
OutOfRange
InRange
OutOfRange
OutOfRange
InRange
OutOfRange
OutOfRange
InRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
OutOfRange
InRange
OutOfRange
OutOfRange
InRange
OutOfRange
InRange
InRange
OutOfRange
InRange
OutOfRange
InRange
OutOfRange
InRange
InRange
InRange
OutOfRange
OutOfRange
InRange
InRange
OutOfRange
InRange
O