In [None]:
!pip install xgboost



In [None]:
#loading the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

#loading the dataset
df = pd.read_csv('tcc_ceds_music.csv')
#explicit display: a bare `df.head()` in the middle of a cell is evaluated and then
#discarded, because Jupyter only renders the last expression of a cell
display(df.head())

#relevant feature columns: lyrical theme scores first, then audio attributes
theme_cols = [
    'dating',
    'violence',
    'world/life',
    'night/time',
    'shake the audience',
    'family/gospel',
    'romantic',
    'communication',
    'obscene',
    'music',
    'movement/places',
    'light/visual perceptions',
    'family/spiritual',
    'like/girls',
    'sadness',
    'feelings',
]

musical_cols = [
    'valence',
    'energy',
    'danceability',
    'acousticness',
    'instrumentalness',
    'loudness',
]

#full model input: themes followed by musical attributes, in that order
all_features = [*theme_cols, *musical_cols]

#dropping rows with a missing feature or release date value. The musical columns are
#included as well because PCA (used later for the visualization) cannot handle NaN.
#.copy() makes df_clean an independent frame, so adding a column below does not
#trigger pandas' SettingWithCopyWarning.
df_clean = df.dropna(subset=all_features + ['release_date']).copy()

#creating a new column for decade (floor the year to a multiple of 10, e.g. 1987 -> 1980)
df_clean['decade'] = (df_clean['release_date'] // 10) * 10

#preparing features (X) and target labels (y)
X = df_clean[all_features]
y = df_clean['decade']
#encode the decades as consecutive integer class ids (0 .. n_classes-1)
le = LabelEncoder()
y = le.fit_transform(y)

#not needed for training, but helpful to add track names for the visualization
track_names = df_clean['track_name']

#train/test split, stratified by decade. Features, labels and track names go through
#ONE call so that row i of X_test, y_test and track_names_test is the same song.
X_train_raw, X_test_raw, y_train, y_test, track_names_train, track_names_test = train_test_split(
    X, y, track_names, stratify=y, test_size=0.2, random_state=42
)

#standardizing the features with standard scaler: fit on the training rows only so
#that no test-set statistics leak into training, then apply the same transform to test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

#defining hyperparameter grid for XGBoost tuning (3 * 3 * 2 * 2 * 2 = 72 combinations)
params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

#initialize XGBoost classifier with GPU acceleration and multi-class objective.
#GPU training is requested with tree_method='hist' + device='cuda'; the older
#tree_method='gpu_hist' spelling is deprecated, and `predictor` / `use_label_encoder`
#are reported as unused by XGBoost, so they are no longer passed.
xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    tree_method='hist',
    device='cuda',
    eval_metric='mlogloss',
    random_state=42
)
#performing grid search cross-validation (3 folds, accuracy) to find best hyperparameters
grid = GridSearchCV(xgb, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train)

#report the winning hyperparameters and their mean cross-validated accuracy
print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

#score the best estimator found by the search on the held-out test split
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

#compute all metrics first, then print them in one place
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:")
print(report_text)
print("\nConfusion Matrix:")
print(conf_mat)


Fitting 3 folds for each of 72 candidates, totalling 216 fits



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", dev

Best params: {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Best score: 0.3921225782694429
Accuracy: 0.3940088105726872

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.27      0.30       294
           1       0.38      0.52      0.44       682
           2       0.35      0.35      0.35       790
           3       0.42      0.43      0.43       935
           4       0.34      0.21      0.26       892
           5       0.34      0.25      0.29       956
           6       0.46      0.63      0.53      1126

    accuracy                           0.39      5675
   macro avg       0.38      0.38      0.37      5675
weighted avg       0.38      0.39      0.38      5675


Confusion Matrix:
[[ 78 162  17  11  12   7   7]
 [ 51 352 122  42  27  36  52]
 [ 14 134 276 182  61  55  68]
 [ 22  80 149 401 114  79  90]
 [ 27  91 111 194 183  96 190]
 [ 27  66  71  60  76 237 419]



    E.g. tree_method = "hist", device = "cuda"



In [None]:
#peek at the test-split labels (integer class ids produced by the LabelEncoder, not decade years)
y_test

array([5, 6, 4, ..., 5, 3, 4])

In [None]:
import numpy as np

#per-class probabilities for every test song (one column per encoded class)
y_proba_best_model = best_model.predict_proba(X_test)
prob_columns = [f'prob_{class_id}' for class_id in best_model.classes_]

#assemble probabilities, track names and true/predicted class ids into one table.
#NOTE(review): track_names_test must come from the same split as X_test / y_test,
#otherwise the names will not belong to the rows they are attached to.
proba_df = pd.DataFrame(y_proba_best_model, columns=prob_columns).assign(
    track_name=track_names_test.reset_index(drop=True),
    true_decade=y_test,
    predicted_decade=y_pred,
)

proba_df

Unnamed: 0,prob_0,prob_1,prob_2,prob_3,prob_4,prob_5,prob_6,track_name,true_decade,predicted_decade
0,0.000627,0.002559,0.008820,0.125318,0.422500,0.238805,0.201372,velvet light,5,4
1,0.000417,0.002000,0.002078,0.008325,0.038558,0.213127,0.735496,"andy, you're a star",6,6
2,0.006628,0.056522,0.111760,0.282478,0.378199,0.095459,0.068953,with a little luck,4,4
3,0.001170,0.014191,0.016966,0.062090,0.070612,0.234595,0.600376,voodoo mon amour,5,6
4,0.001014,0.009169,0.023478,0.151939,0.144545,0.493994,0.175860,gulf coast highway (with willie nelson),3,5
...,...,...,...,...,...,...,...,...,...,...
5670,0.000753,0.004498,0.014484,0.054428,0.061098,0.300333,0.564407,"hi lili, hi lo",5,6
5671,0.002382,0.011142,0.093925,0.446571,0.314460,0.077351,0.054169,stop the hate,4,3
5672,0.001452,0.009175,0.036008,0.202699,0.122420,0.431087,0.197159,the king will come,5,5
5673,0.001127,0.037641,0.191934,0.305674,0.167631,0.233586,0.062408,kiss me black,3,3


In [None]:
#relabel the probability columns from encoded class ids to the decades they stand for
prob_to_decade = {
    'prob_0': '1950',
    'prob_1': '1960',
    'prob_2': '1970',
    'prob_3': '1980',
    'prob_4': '1990',
    'prob_5': '2000',
    'prob_6': '2010',
}
proba_df.rename(columns=prob_to_decade, inplace=True)

#write the table to disk (row index included, as pandas does by default)
proba_df.to_csv('xgboost_decade_probabilities.csv')

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

#extracting features and decade labels for every song in the cleaned dataset.
#The decades get their own name so the encoded training labels in `y` stay intact.
X = df_clean[all_features]
decades = df_clean['decade']

#transforming true labels into encoded form using fitted LabelEncoder
y_encoded = le.transform(decades)
#using the trained XGBoost model to predict decades for all songs; features go through
#the same `scaler` the model was trained with. Note this covers every row of df_clean,
#including songs the model saw during training, not only held-out ones.
xgb_predictions = best_model.predict(scaler.transform(X))

#standardize features before PCA with a dedicated scaler: rebinding `scaler` here
#would replace the one the model depends on and make re-running this cell
#order-dependent
pca_scaler = StandardScaler()
X_pca_input = pca_scaler.fit_transform(X)

#applying PCA to reduce feature space from 22D to 2D for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_pca_input)

#creating a dataframe for visualization; .values drops the df_clean index so the
#columns line up positionally with the PCA rows
viz_df = pd.DataFrame(X_pca, columns=["PCA_1", "PCA_2"])
viz_df["track_name"] = df_clean["track_name"].values
viz_df["artist_name"] = df_clean["artist_name"].values
viz_df["predicted_decade"] = le.inverse_transform(xgb_predictions)
viz_df["actual_decade"] = decades.values

#exporting the visualization data to CSV (used for plotting clusters in Tableau or similar tools)
viz_df.to_csv("xgboost_cluster_visualization.csv", index=False)
