In [2]:
import sys
import os
sys.path.append(os.path.abspath("../../../../"))

import pandas as pd
import numpy as np
import utils
# Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Evaluation
from sklearn.metrics import classification_report
import plot_automations as plotter

In [3]:
df = pd.read_csv("../../../../01_color_clustering/weighted/weighted_omniart-post-color-clustering.csv")

In [4]:
# Add decade to creation intervals
df["decade"] = (df["creation_year"] // 10 * 10).astype(int)

In [5]:
color_clusters = pd.read_csv("../../../../01_color_clustering/weighted/weighted_color_centroids.csv")

**1. Features**

In [6]:
color_names = color_clusters["HEX"].tolist()
color_to_index = {name: i for i, name in enumerate(color_names)}
n_colors = len(color_names)

In [7]:
color_X = np.vstack(df.apply(lambda row: utils.artwork_to_vector(row, n_colors, color_to_index), axis=1))

In [8]:
X = color_X

**2. Labels**

In [9]:
# Step 3: Encode decade labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["decade"])

**3. Split dataset**

In [10]:
# Step 4: Split and train
X_train,  X_test,  y_train,  y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**4. Train Random Forest Classifier**

In [11]:
clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
clf.fit(X_train, y_train)

**5. Predict**

In [12]:
y_pred = clf.predict(X_test)

**6. Evaluate**

In [13]:
labels_in_test = np.unique(y_test)

In [14]:
class_names_in_test = [str(label_encoder.classes_[i]) for i in labels_in_test]

In [15]:
# Generate report as dict
report_dict = classification_report(
    y_test, y_pred,
    labels=labels_in_test,
    target_names=class_names_in_test,
    zero_division=0,
    output_dict=True
)

In [16]:
report = pd.DataFrame(report_dict).transpose()

In [17]:
# Sort by support
summary_rows = report.loc[report.index.str.contains("avg|accuracy")]
class_rows = report.loc[~report.index.str.contains("avg|accuracy")].copy()
class_rows["support"] = class_rows["support"].astype(int)
class_rows_sorted = class_rows.sort_values(by="support", ascending=False)

In [18]:
report = pd.concat([class_rows_sorted, summary_rows])

In [19]:
report

Unnamed: 0,precision,recall,f1-score,support
2010,0.521514,1.000000,0.685520,27562.000000
2000,0.000000,0.000000,0.000000,9310.000000
1890,0.000000,0.000000,0.000000,1412.000000
1880,0.000000,0.000000,0.000000,774.000000
1910,0.000000,0.000000,0.000000,630.000000
...,...,...,...,...
0,0.000000,0.000000,0.000000,1.000000
900,0.000000,0.000000,0.000000,1.000000
accuracy,0.521514,0.521514,0.521514,0.521514
macro avg,0.004829,0.009259,0.006347,52850.000000


In [20]:
report.to_csv("weighted_color-decade_classifier_report.csv")

**7. Features importance**

In [21]:
importances = clf.feature_importances_

In [22]:
color_feature_names = color_names

feature_names = color_feature_names

In [23]:
importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

In [24]:
importance_df

Unnamed: 0,feature,importance
0,#eab2c1,0.0
671,#8b9585,0.0
658,#4a0e1e,0.0
659,#8b6f52,0.0
660,#49284f,0.0
...,...,...
338,#81564b,0.0
339,#37531c,0.0
340,#c88f75,0.0
341,#eceff4,0.0


In [25]:
importance_df.to_csv("weighted_color-decade_feature_importance.csv", index=False)