In [1]:
import sys
import os
sys.path.append(os.path.abspath("../../../../"))

import pandas as pd
import numpy as np
import utils
# Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Evaluation
from sklearn.metrics import classification_report
import plot_automations as plotter

In [2]:
# Load the OmniArt post-colour-clustering CSV (non-weighted variant) into df.
df = pd.read_csv("../../../../01_color_clustering/non-weighted/omniart-post-color-clustering.csv")

In [3]:
# Derive a "decade" column: floor each creation year to a multiple of 10
# (e.g. 1887 -> 1880) and store the result as an integer.
df["decade"] = df["creation_year"].floordiv(10).mul(10).astype(int)

In [4]:
# Load the colour-centroid table (non-weighted variant); its "HEX" column is read below.
color_clusters = pd.read_csv("../../../../01_color_clustering/non-weighted/color_centroids.csv")

**1. Features**

In [5]:
# HEX codes of the colour centroids, in file order.
color_names = color_clusters["HEX"].to_list()
n_colors = len(color_names)
# Lookup from each HEX code to its position in color_names.
color_to_index = dict(zip(color_names, range(n_colors)))

In [6]:
# Call utils.artwork_to_vector on every row of df (passing n_colors and the
# HEX -> index lookup) and stack the per-row results into one 2-D array.
# NOTE(review): presumably each result is a length-n_colors vector — confirm in utils.
# NOTE(review): the feature-importance table saved further down reports 0 for
# every colour column; verify these vectors are not identical for all rows.
color_X = np.vstack(
    df.apply(utils.artwork_to_vector, axis=1, args=(n_colors, color_to_index))
)

In [7]:
# Turn artist names into integer codes and shape them as a single feature column.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; the integer
# codes give the trees an arbitrary ordering of artists — confirm this is intended.
artist_encoder = LabelEncoder()
artist_codes = artist_encoder.fit_transform(df["artist_full_name"])
artist_X = artist_codes.reshape(-1, 1)

In [8]:
# Turn school names into integer codes and shape them as a single feature column.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; the integer
# codes give the trees an arbitrary ordering of schools — confirm this is intended.
school_encoder = LabelEncoder()
school_codes = school_encoder.fit_transform(df["school"])
school_X = school_codes.reshape(-1, 1)

In [9]:
# Assemble the feature matrix column-wise: colour columns first, then the
# artist code, then the school code. feature_names below relies on this order.
X = np.concatenate((color_X, artist_X, school_X), axis=1)

**2. Labels**

In [10]:
# Map each decade to a class id; label_encoder.classes_ keeps the original
# decade values so predictions can be translated back.
label_encoder = LabelEncoder().fit(df["decade"])
y = label_encoder.transform(df["decade"])

**3. Split dataset**

In [11]:
# Hold out 20% of the samples for evaluation. The seed is fixed so the split is
# repeatable; no stratification is requested.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**4. Train Random Forest Classifier**

In [12]:
# Random forest of 200 trees, each limited to depth 20, with a fixed seed so the
# fitted model is repeatable.
# n_jobs=-1 lets scikit-learn build (and later query) the trees on all available
# cores instead of one; the seed still fixes how each tree is grown.
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

In [24]:
# Original decade value for every encoded class id, ordered by class id.
# Decoding ids 0..n_classes-1 is the same as taking a copy of classes_.
class_labels = label_encoder.classes_.copy()

**5. Predict**

In [13]:
# Predict an encoded decade class for every held-out sample.
y_pred = clf.predict(X_test)

**6. Evaluate**

In [14]:
# Sorted, de-duplicated encoded classes that actually occur in the test split.
labels_in_test = np.unique(y_test)

In [15]:
# Decade of each class present in the test split, as text, in the same order
# as labels_in_test.
class_names_in_test = list(map(str, label_encoder.classes_[labels_in_test]))

In [16]:
# Build the per-class precision / recall / F1 report as a nested dict.
# The report is limited to classes that occur in y_test, and zero_division=0
# scores undefined metrics as 0.
report_options = dict(
    labels=labels_in_test,
    target_names=class_names_in_test,
    zero_division=0,
    output_dict=True,
)
report_dict = classification_report(y_test, y_pred, **report_options)

In [17]:
# One row per class / average, one column per metric.
report = pd.DataFrame(report_dict).T

In [18]:
# Split the report into summary rows (index contains "avg" or "accuracy") and
# per-class rows, then order the per-class rows by support, largest first.
is_summary = report.index.str.contains("avg|accuracy")
summary_rows = report.loc[is_summary]
# astype returns a new frame, so report itself is left untouched.
class_rows = report.loc[~is_summary].astype({"support": int})
class_rows_sorted = class_rows.sort_values(by="support", ascending=False)

In [20]:
# Reassemble the report: per-class rows (largest support first) followed by the
# summary rows.
# NOTE(review): the saved output shows support as float again (e.g. 27562.0),
# so the earlier integer cast does not survive this concat — confirm that is acceptable.
report = pd.concat([class_rows_sorted, summary_rows])

In [22]:
# Display the assembled report.
report

Unnamed: 0,precision,recall,f1-score,support
2010,0.798503,0.967927,0.875090,27562.0
2000,0.738019,0.272932,0.398494,9310.0
1890,0.440415,0.722380,0.547210,1412.0
1880,0.536634,0.350129,0.423769,774.0
1910,0.367318,0.417460,0.390788,630.0
...,...,...,...,...
0,0.000000,0.000000,0.000000,1.0
900,0.000000,0.000000,0.000000,1.0
micro avg,0.684890,0.684825,0.684857,52850.0
macro avg,0.409287,0.391793,0.372206,52850.0


In [23]:
# Save the report next to the notebook; the index (class / average names) is
# written as the first CSV column.
report.to_csv("decade_classifier_report.csv")

**7. Feature importance**

In [27]:
# Importance scores reported by the fitted forest; paired with feature_names below.
importances = clf.feature_importances_

In [31]:
# Names for the columns of X, in the order they were stacked: the colour HEX
# codes, then the artist code, then the school code.
color_feature_names = color_names
artist_feature_name = ["artist"]
school_feature_name = ["school"]

feature_names = [*color_feature_names, *artist_feature_name, *school_feature_name]

In [35]:
# Pair every feature name with its importance and rank from most to least important.
# NOTE(review): in the saved output artist and school sum to 1.0 and every
# colour column shows 0.000000; check that color_X actually varies between
# artworks before trusting this ranking.
importance_columns = {"feature": feature_names, "importance": importances}
importance_df = pd.DataFrame(importance_columns).sort_values(
    by="importance", ascending=False
)

In [36]:
# Display the ranked feature importances.
importance_df

Unnamed: 0,feature,importance
1000,artist,0.551617
1001,school,0.448383
672,#c1afbe,0.000000
659,#b39341,0.000000
660,#ed5714,0.000000
...,...,...
339,#dac175,0.000000
340,#9aabbc,0.000000
341,#52697d,0.000000
342,#8cb99d,0.000000


In [37]:
# Save the ranking next to the notebook; index=False leaves the row index out of the file.
importance_df.to_csv("decade_feature_importance.csv", index=False)