In [None]:
#load the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#location of the csv with the tracks, relative to the working directory
dataset_path = 'tcc_ceds_music.csv'

#reading the csv into a dataframe
df = pd.read_csv(dataset_path)

#showing the first rows as a quick look at the columns
df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [None]:
#data preprocessing

#defining the relevant columns for the lyrical themes and musical features
theme_cols = ['dating', 'violence', 'world/life', 'night/time', 'shake the audience','family/gospel', 'romantic', 'communication', 'obscene', 'music','movement/places', 'light/visual perceptions', 'family/spiritual','like/girls', 'sadness', 'feelings']
musical_cols = ['valence', 'energy', 'danceability', 'acousticness', 'instrumentalness', 'loudness']
all_features = theme_cols + musical_cols

#dropping rows with a missing value in ANY selected feature (themes and musical) or in release date
#.copy() gives an independent frame so adding a column below does not raise SettingWithCopyWarning
df_clean = df.dropna(subset=all_features + ['release_date']).copy()

#defining a new column for decades (e.g. 1957 -> 1950)
df_clean['decade'] = (df_clean['release_date'] // 10) * 10

#defining features matrix, labels and the matching track names (all taken from df_clean so rows line up)
X = df_clean[all_features]
y = df_clean['decade']
track_names = df_clean['track_name']

#train/test split
#features, labels and track names go through ONE stratified call so every output shares the same rows;
#splitting the names in a separate, non-stratified call would pair titles with the wrong songs
X_train_raw, X_test_raw, y_train, y_test, track_names_train, track_names_test = train_test_split(
    X, y, track_names, stratify=y, test_size=0.2, random_state=42
)

#standardizing the features with standard scaler
#the scaler is fitted on the training rows only so no test-set statistics leak into training
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

#scaled version of the full cleaned feature matrix, kept under its original name
X_scaled = scaler.transform(X)


In [None]:
from sklearn.utils import resample

#one sub-frame per decade, in the order the decades first appear in df_clean
dfs_by_decade = [
    df_clean[df_clean['decade'] == decade_value]
    for decade_value in df_clean['decade'].unique()
]

#size of the smallest decade group; every group is cut down to this many rows
min_count = min(map(len, dfs_by_decade))

#drawing min_count rows without replacement from each decade group
balanced_dfs = [
    resample(decade_df, replace=False, n_samples=min_count, random_state=42)
    for decade_df in dfs_by_decade
]

#stacking the groups, shuffling the rows and renumbering the index
stacked_df = pd.concat(balanced_dfs)
shuffled_df = stacked_df.sample(frac=1, random_state=42)
df_balanced = shuffled_df.reset_index(drop=True)

#features and labels of the balanced frame (replaces the earlier X and y)
X = df_balanced[all_features]
y = df_balanced['decade']


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

#random forest configuration, collected in one place
rf_params = dict(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

#predictions for the held-out test rows
y_pred_rf = rf_model.predict(X_test)

#overall accuracy plus the per-class report as a nested dict
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)

#the report also holds summary rows ('accuracy', 'macro avg', ...); keep only the all-digit decade keys
decade_labels = sorted(
    label for label in rf_report
    if isinstance(label, str) and label.isdigit()
)

#printing precision, recall, and F1-score per decade
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")
print("\nPrecision, Recall, and F1-Score per Decade:\n")
for decade in decade_labels:
    scores = rf_report[decade]
    print(f"{decade}: Precision = {scores['precision']:.2f}, Recall = {scores['recall']:.2f}, F1-score = {scores['f1-score']:.2f}")


Random Forest Accuracy: 0.39

Precision, Recall, and F1-Score per Decade:

1950: Precision = 0.36, Recall = 0.34, F1-score = 0.35
1960: Precision = 0.35, Recall = 0.52, F1-score = 0.42
1970: Precision = 0.35, Recall = 0.34, F1-score = 0.34
1980: Precision = 0.42, Recall = 0.43, F1-score = 0.43
1990: Precision = 0.36, Recall = 0.20, F1-score = 0.26
2000: Precision = 0.36, Recall = 0.26, F1-score = 0.30
2010: Precision = 0.47, Recall = 0.60, F1-score = 0.53


In [None]:
#for file saving
#mounts Google Drive at /content/drive; the google.colab package is only importable inside Colab
#NOTE(review): no path under /content/drive is referenced in the cells visible here - confirm the mount is still needed
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np

#generating the class probabilities (one column per decade, ordered like rf_model.classes_)
y_proba_rf = rf_model.predict_proba(X_test)
#df for storing the probabilities
proba_df = pd.DataFrame(y_proba_rf, columns=[f'prob_{decade}' for decade in rf_model.classes_])
#y_test keeps the df_clean row labels of the test rows, so the titles are looked up through that index;
#this guarantees each title sits next to its own probabilities, true decade and prediction
proba_df['track_name'] = df_clean.loc[y_test.index, 'track_name'].to_numpy()
proba_df['true_decade'] = y_test.to_numpy()
proba_df['predicted_decade'] = y_pred_rf

proba_df

Unnamed: 0,prob_1950,prob_1960,prob_1970,prob_1980,prob_1990,prob_2000,prob_2010,track_name,true_decade,predicted_decade
0,0.000000,0.001107,0.020571,0.126404,0.437854,0.251096,0.162968,velvet light,2000,1990
1,0.000000,0.005308,0.000378,0.011836,0.040755,0.234491,0.707231,"andy, you're a star",2010,2010
2,0.030878,0.083983,0.181162,0.290557,0.238137,0.093909,0.081373,with a little luck,1990,1980
3,0.007327,0.023547,0.043347,0.100155,0.107381,0.282279,0.435963,voodoo mon amour,2000,2010
4,0.000000,0.019708,0.074282,0.155747,0.167641,0.365213,0.217408,gulf coast highway (with willie nelson),1980,2000
...,...,...,...,...,...,...,...,...,...,...
5670,0.000109,0.023829,0.028154,0.058760,0.163821,0.293738,0.431589,"hi lili, hi lo",2000,2010
5671,0.007762,0.043424,0.165319,0.345176,0.280172,0.067916,0.090230,stop the hate,1990,1980
5672,0.008536,0.003763,0.064936,0.159507,0.147815,0.407158,0.208286,the king will come,2000,2000
5673,0.008739,0.057447,0.212275,0.340985,0.184931,0.123175,0.072447,kiss me black,1980,1980


In [None]:
#tuning the hyperparameters
from sklearn.model_selection import GridSearchCV

#candidate values for every random forest hyperparameter that is searched
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[10, 15, 20, 25, None],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 6],
)

#base classifier whose hyperparameters get tuned
rfc = RandomForestClassifier(class_weight='balanced', random_state=42)

#exhaustive search over the grid: 3-fold cross-validation, ranked by weighted F1, all cores
grid_search = GridSearchCV(
    rfc,
    param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)

#running the search on the training data only
grid_search.fit(X_train, y_train)

#reporting the winning combination and keeping the refitted best model as rf_model
print("Best Params:", grid_search.best_params_)
rf_model = grid_search.best_estimator_




In [None]:
#re-scoring the test split with the CURRENT rf_model before printing;
#without this the numbers below are the ones computed for the earlier, untuned forest
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)

#printing precision, recall, and F1-score per decade
print(f"Random Forest Accuracy: {rf_accuracy:.2f}\n")
print("Precision, Recall, and F1-Score per Decade:\n")

for decade in sorted(rf_model.classes_):
    #the report dict is keyed by the string form of each class label
    decade_str = str(decade)
    if decade_str in rf_report:
        metrics = rf_report[decade_str]
        print(f"{decade}: Precision = {metrics['precision']:.2f}, Recall = {metrics['recall']:.2f}, F1 = {metrics['f1-score']:.2f}")


Random Forest Accuracy: 0.39

Precision, Recall, and F1-Score per Decade:

1950: Precision = 0.36, Recall = 0.34, F1 = 0.35
1960: Precision = 0.35, Recall = 0.52, F1 = 0.42
1970: Precision = 0.35, Recall = 0.34, F1 = 0.34
1980: Precision = 0.42, Recall = 0.43, F1 = 0.43
1990: Precision = 0.36, Recall = 0.20, F1 = 0.26
2000: Precision = 0.36, Recall = 0.26, F1 = 0.30
2010: Precision = 0.47, Recall = 0.60, F1 = 0.53


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

#features of the complete balanced dataset
X = df_balanced[all_features]

#rf_model was fitted on standardized features, so the raw values go through the training scaler first;
#feeding unscaled features to the forest would give meaningless predictions
#note: the balanced frame also contains rows the model was trained on, so these predictions
#are for visualization only and are not an estimate of held-out performance
rf_predictions = rf_model.predict(scaler.transform(X))

#standardizing the feature space for pca
#a separate scaler is used so the training `scaler` above is not overwritten (keeps this cell re-runnable)
pca_scaler = StandardScaler()
X_scaled = pca_scaler.fit_transform(X)

#2d for visualizations
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

#creating the df for visualizations
#every column is attached positionally (.values) so nothing depends on index alignment
viz_df = pd.DataFrame(X_pca, columns=["PCA_1", "PCA_2"])
viz_df["track_name"] = df_balanced["track_name"].values
viz_df["artist_name"] = df_balanced["artist_name"].values
viz_df["predicted_decade"] = rf_predictions
viz_df["actual_decade"] = df_balanced["decade"].values

#saving to be exported to tableau
viz_df.to_csv("rf_cluster_visualization.csv", index=False)


