# Genre is a Construct Project

First, we loaded the data in, and made sure to convert all qualitative data to quantitative so we could use them in model building. 

In [62]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
np.random.seed(42)
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer



from csv import reader


with open('cs305fp_music_genre.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    count = 0
    data = []
    titles = []
    artists = []
    instance_id = []
    genres = []
    musicKeys = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B']
    
    for row in csv_reader:
        if count == 0:
            originalcol = row
            columns = row[3:14]+row[16:]
        else:
            instance_id.append(row[0])
            artists.append(row[1])
            titles.append(row[2])
            
            #music key (9 in list)
            if row[9] in musicKeys:
                row[9] = musicKeys.index(row[9])
            
            #mode (12 in list)
            if row[12] == 'Major':
                row[12] = 1
            elif row[12] == 'Minor':
                row[12] = 0
            
            
            #tempo (14 in list) 
            if row[14] == "?":
                row[14] = ''
            
            #music_genre (17 in list)
            if row[17] in genres and row[17] != 0:
                row[17] = genres.index(row[17])
            else:
                genres.append(row[17])
                row[17] = genres.index(row[17])
            
            #make sure to not include list items 0-2 and item 14,15 in list
            new = row[3:14]+row[16:]
            if row[17] != 0 and row[6] != '-1.0':
                data.append(new)
            
            
            
        count = count + 1
        
        
#Cleaning the data to chance all spaces to zeros and floats
total = 0
for count,row in enumerate(data):
    for count2,c in enumerate(row):
        if data[count][count2] != '':
            data[count][count2] = float(data[count][count2])
        else:
            total = total + 1
            #print(count,count2) #see which areas have missingness
            data[count][count2] = 0
            
#From the code above, we found that tempo has 4081 pieces of missingness, so we removed the column from the dataset
#as to eliminate bias

data = data[:-3] #remove NA rows 

#Change to np.array to make it easier to work with            
data = np.array(data)
 
#shape of our uploaded dataset
print(np.shape(data))

            

(40546, 13)


In [76]:
print(columns) #column names

['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'valence', 'music_genre']


In [65]:
print(genres) #all our genres

['Electronic', 'Anime', '', 'Jazz', 'Alternative', 'Country', 'Rap', 'Blues', 'Rock', 'Classical', 'Hip-Hop']


# Data Exploration and More Cleaning

In [77]:
import pandas as pd

df = pd.DataFrame(data)

In [78]:
dur = df.iloc[:,6]
for i in range(len(dur)):
    if dur[i] < 0.0:
        dur[i] = 0.00
        
#finding the average of the duration of the songs (in milliseconds)
avg = np.sum(dur) / len(dur)
avg

#replacing all 0.0 values
for i in range(len(dur)):
    if dur[i] == 0.0:
        dur[i] = avg
        
df.iloc[:,6] = dur
df=df.fillna(0)

# Model Building

In [96]:
#Scaling the Data 
DATA = np.array(df)
X = DATA[:,:-1]
y = DATA[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Below we build a Random Forest and kNN model. 

In [85]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import linear_model  # Using sklearn Perceptron classifier
from sklearn import tree  # Using Tree classifier
from sklearn import neighbors  # Using nearest neighbors classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import sklearn.svm as svm
from sklearn.preprocessing import StandardScaler

from sklearn import ensemble

learners = {
            'Forest': ensemble.RandomForestClassifier(),
            'kNN': neighbors.KNeighborsClassifier(),
           }
    

for classM in learners:
    print(classM)
    learners[classM].fit(X_train,y_train)
    print(learners[classM].score(X_test,y_test))




Forest
0.55154130702836
kNN
0.4908754623921085


Next, we look at the individual one versus all logistic and SVM models.

In [99]:
y_train1 = np.array(y_train,copy=True) #create a temporary y (the genres)
y_test1 = np.array(y_test,copy=True) #create a temporary y (the genres)

learners = {
            'Logistic': LogisticRegression()
           }


count = 0
for count in range(10): #for every genre
    count = count + 1
    for c,i in enumerate(y_train): #for every genre, we will do a one versus rest comparison and change the values to be 0 or 1
        if i != float(count):
            y_train1[c] = 0.0
        else: 
            y_train1[c] = 1.0

    for c1,i1 in enumerate(y_test): #for every genre, we will do a one versus rest comparison and change the values to be 0 or 1
        if i1 != float(count):
            y_test1[c1] = 0.0
        else: 
            y_test1[c1] = 1.0
    print(y_train)
    print(y_train1)

    #Running Logistic on this genre versus all the rest, and looking at the coefficients for interpretability
    for classM in learners:
        print(classM)
        learners[classM].fit(X_train1,y_train1)
        print(learners[classM].coef_)
        print(learners[classM].score(X_test1,y_test1))

    
    

[10.  4.  3. ...  8.  7.  1.]
[0. 0. 0. ... 0. 0. 1.]
Logistic
[[-0.00411209  0.04006726  0.01131111 -0.00948009  0.03604135  0.00645661
   0.01862778  0.00033903 -0.02193301  0.00405736  0.00460544  0.03198464]]
0.8893958076448829
[10.  4.  3. ...  8.  7.  1.]
[0. 0. 0. ... 0. 0. 0.]
Logistic
[[-0.39279531  0.69514429  0.28185     0.12273654 -0.84741803  0.24957125
  -0.44174829  0.0803512   0.75570489  0.74670504 -0.66705132 -0.09732633]]
0.9998766954377312
[10.  4.  3. ...  8.  7.  1.]
[0. 0. 1. ... 0. 0. 0.]
Logistic
[[ 0.00411679  0.0029762   0.03495118  0.0167104   0.05501808 -0.0103698
  -0.00436803 -0.00912021 -0.04844983  0.00745208 -0.00264344  0.01110715]]
0.8859432799013564
[10.  4.  3. ...  8.  7.  1.]
[0. 1. 0. ... 0. 0. 0.]
Logistic
[[ 0.00690969 -0.0098482  -0.00739023 -0.02116059 -0.01873358  0.0148223
   0.00292883 -0.01055916  0.0568025   0.01114619 -0.00845396 -0.02366428]]
0.8882860665844636
[10.  4.  3. ...  8.  7.  1.]
[0. 0. 0. ... 0. 0. 0.]
Logistic
[[-0.010929

Now, we will combine all of the one versus rest models using the one versus rest classifier to build a new model that can predict specific genres. 

In [90]:
#One versus Rest SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
   
# Creating the SVM model
model = OneVsRestClassifier(SVC())
   
# Fitting the model
model.fit(X_train, y_train)
   
# Making a prediction
prediction = model.predict(X_test)
   
# Evaluating the model
print(f"Test Set Accuracy : {accuracy_score(y_test, prediction)} %\n\n")
print(f"Classification Report : \n\n{classification_report(y_test, prediction)}")
print(f"Parameters : \n{model.get_params()}")

Test Set Accuracy : 0.5651048088779285 %


Classification Report : 

              precision    recall  f1-score   support

         1.0       0.71      0.78      0.74       900
         3.0       0.59      0.59      0.59       922
         4.0       0.50      0.23      0.32       905
         5.0       0.51      0.55      0.53       886
         6.0       0.43      0.46      0.45       886
         7.0       0.56      0.53      0.54       909
         8.0       0.48      0.75      0.58       867
         9.0       0.82      0.84      0.83       889
        10.0       0.47      0.37      0.41       946

    accuracy                           0.57      8110
   macro avg       0.56      0.57      0.56      8110
weighted avg       0.56      0.57      0.55      8110

Parameters : 
{'estimator__C': 1.0, 'estimator__break_ties': False, 'estimator__cache_size': 200, 'estimator__class_weight': None, 'estimator__coef0': 0.0, 'estimator__decision_function_shape': 'ovr', 'estimator__degree': 3, '

In [91]:
#One versus Rest Logistic
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
   
# Creating the Logistic model
model = OneVsRestClassifier(LogisticRegression())
   
# Fitting the model 
model.fit(X_train, y_train)
   
# Making a prediction
prediction = model.predict(X_test)
   
# Evaluating the model
print(f"Test Set Accuracy : {accuracy_score(y_test, prediction)} %\n\n")
print(f"Classification Report : \n\n{classification_report(y_test, prediction)}")
print(model.get_params())

Test Set Accuracy : 0.5255240443896424 %


Classification Report : 

              precision    recall  f1-score   support

         1.0       0.62      0.70      0.66       900
         3.0       0.54      0.45      0.49       922
         4.0       0.42      0.31      0.36       905
         5.0       0.42      0.57      0.49       886
         6.0       0.48      0.32      0.38       886
         7.0       0.52      0.41      0.46       909
         8.0       0.48      0.68      0.56       867
         9.0       0.76      0.80      0.78       889
        10.0       0.48      0.51      0.49       946

    accuracy                           0.53      8110
   macro avg       0.52      0.53      0.52      8110
weighted avg       0.52      0.53      0.52      8110

{'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__l1_ratio': None, 'estimator__max_iter': 100, 'estimator__multi_c

In [95]:
#One versus Rest Perceptron
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model  # Using sklearn Perceptron classifier
from sklearn.metrics import accuracy_score, classification_report
   
# Creating the Perceptron model
model = OneVsRestClassifier(linear_model.Perceptron(max_iter = 100))
   
# Fitting the model
model.fit(X_train, y_train)
   
# Making a prediction
prediction = model.predict(X_test)
   
# Evaluating the model
print(f"Test Set Accuracy : {accuracy_score(y_test, prediction)} \n\n")
print(f"Classification Report : \n{classification_report(y_test, prediction)}")
print(model.get_params())

Test Set Accuracy : 0.36189889025893957 


Classification Report : 
              precision    recall  f1-score   support

         1.0       0.58      0.44      0.50       900
         3.0       0.30      0.57      0.39       922
         4.0       0.21      0.25      0.23       905
         5.0       0.45      0.10      0.16       886
         6.0       0.31      0.00      0.01       886
         7.0       0.30      0.45      0.36       909
         8.0       0.46      0.24      0.32       867
         9.0       0.81      0.46      0.59       889
        10.0       0.32      0.70      0.44       946

    accuracy                           0.36      8110
   macro avg       0.42      0.36      0.33      8110
weighted avg       0.41      0.36      0.33      8110

{'estimator__alpha': 0.0001, 'estimator__class_weight': None, 'estimator__early_stopping': False, 'estimator__eta0': 1.0, 'estimator__fit_intercept': True, 'estimator__max_iter': 100, 'estimator__n_iter_no_change': 5, 'estimato

We also investigated the model of the one versus one classifier.

In [93]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
   
# Creating the SVM model
model = OneVsOneClassifier(SVC())
   
# Fitting the model with training data
model.fit(X_train, y_train)
   
# Making a prediction on the test set
prediction = model.predict(X_test)
   
# Evaluating the model
print(f"Test Set Accuracy : {accuracy_score(y_test, prediction)} %\n\n")
print(f"Classification Report : \n\n{classification_report(y_test, prediction)}")
print(f"Parameters : \n{model.get_params()}")

Test Set Accuracy : 0.5901356350184956 %


Classification Report : 

              precision    recall  f1-score   support

         1.0       0.78      0.73      0.75       900
         3.0       0.63      0.63      0.63       922
         4.0       0.47      0.39      0.42       905
         5.0       0.55      0.56      0.55       886
         6.0       0.46      0.37      0.41       886
         7.0       0.64      0.55      0.59       909
         8.0       0.49      0.72      0.59       867
         9.0       0.85      0.84      0.84       889
        10.0       0.48      0.53      0.50       946

    accuracy                           0.59      8110
   macro avg       0.59      0.59      0.59      8110
weighted avg       0.59      0.59      0.59      8110

Parameters : 
{'estimator__C': 1.0, 'estimator__break_ties': False, 'estimator__cache_size': 200, 'estimator__class_weight': None, 'estimator__coef0': 0.0, 'estimator__decision_function_shape': 'ovr', 'estimator__degree': 3, '

In [94]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
   
# Creating the Logistic model
model = OneVsOneClassifier(LogisticRegression())
   
# Fitting the model with training data
model.fit(X_train, y_train)
   
# Making a prediction on the test set
prediction = model.predict(X_test)
   
# Evaluating the model
print(f"Test Set Accuracy : {accuracy_score(y_test, prediction)} %\n\n")
print(f"Classification Report : \n\n{classification_report(y_test, prediction)}")
print(f"Parameters : \n{model.get_params()}")

Test Set Accuracy : 0.54845869297164 %


Classification Report : 

              precision    recall  f1-score   support

         1.0       0.66      0.71      0.69       900
         3.0       0.58      0.55      0.57       922
         4.0       0.42      0.37      0.40       905
         5.0       0.47      0.53      0.50       886
         6.0       0.44      0.35      0.39       886
         7.0       0.54      0.47      0.50       909
         8.0       0.51      0.66      0.57       867
         9.0       0.81      0.80      0.81       889
        10.0       0.49      0.49      0.49       946

    accuracy                           0.55      8110
   macro avg       0.55      0.55      0.55      8110
weighted avg       0.55      0.55      0.55      8110

Parameters : 
{'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__l1_ratio': None, 'estimator__max_iter': 100, 'estima