In [1]:
import sys
import os
sys.path.append(os.path.abspath("../../../../"))

import numpy as np
import pandas as pd
import utils
# Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
# Evaluation
from sklearn.metrics.pairwise import cosine_similarity
import plot_automations as plotter

In [2]:
df = pd.read_csv("../../../../01_color_clustering/non-weighted/omniart-post-color-clustering.csv")

In [3]:
# Add decade to creation intervals
df["decade"] = (df["creation_year"] // 10 * 10).astype(int)

In [4]:
color_clusters = pd.read_csv("../../../../01_color_clustering/non-weighted/color_centroids.csv")

**1. Split dataset: "Unknown" x other"**

In [5]:
unknown_df = df[df["school"].str.strip().str.lower() == "unknown"]

In [6]:
known_df = df[~(df["school"].str.strip().str.lower() == "unknown")]

**1. Features**

In [7]:
color_names = color_clusters["HEX"].tolist()
color_to_index = {name: i for i, name in enumerate(color_names)}
n_colors = len(color_names)

In [8]:
known_color_X = np.vstack(known_df.apply(lambda row: utils.artwork_to_vector(row, n_colors, color_to_index), axis=1))

In [9]:
unknown_color_X = np.vstack(unknown_df.apply(lambda row: utils.artwork_to_vector(row, n_colors, color_to_index), axis=1))

In [10]:
artist_encoder = LabelEncoder()

In [11]:

known_artist_X = artist_encoder.fit_transform(known_df["artist_full_name"]).reshape(-1, 1)

In [12]:
unknown_artist_X = artist_encoder.fit_transform(unknown_df["artist_full_name"]).reshape(-1, 1)

In [13]:
decade_encoder = LabelEncoder()

In [14]:
known_decade_X = decade_encoder.fit_transform(known_df["decade"]).reshape(-1, 1)

In [15]:
unknown_decade_X = decade_encoder.fit_transform(unknown_df["decade"]).reshape(-1, 1)

In [16]:
known_X = np.hstack([known_color_X, known_artist_X, known_decade_X])

In [17]:
unknown_X = np.hstack([unknown_color_X, unknown_artist_X, unknown_decade_X])

**2. Labels: split between known and unknown schools**

In [18]:
y_unknown = unknown_df["school"].copy()

In [19]:
y_known = known_df["school"].copy()

In [20]:
# Encode only known schools
label_encoder = LabelEncoder()
y_known_encoded = label_encoder.fit_transform(y_known)

**3. Train Random Forest Classifier**

In [21]:
clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
clf.fit(known_X, y_known_encoded)

**4. Predict school for "unknown" rows**

In [22]:
probas = clf.predict_proba(unknown_X)
predicted_indices = np.argmax(probas, axis=1)
predicted_schools = label_encoder.inverse_transform(predicted_indices)
confidences = np.max(probas, axis=1)

In [29]:
unknown_df.copy()
unknown_df.loc[:, "predicted_school"] = predicted_schools
unknown_df.loc[:, "confidence"] = confidences

**5. Prediction color similarity**

In [24]:
features_known = known_color_X
labels_known = y_known_encoded
similarities = []

In [25]:
for x_u, pred_idx in zip(unknown_color_X, predicted_indices):
    class_samples = features_known[labels_known == pred_idx]
    class_mean = class_samples.mean(axis=0)
    sim = cosine_similarity([x_u], [class_mean])[0, 0]
    similarities.append(sim)

In [38]:
unknown_df.loc[:, "color_similarity"] = similarities

In [41]:
unknown_df.sort_values(by="confidence", ascending=False, inplace=True)

In [42]:
unknown_df

Unnamed: 0,artwork_name,artist_full_name,creation_year,century,school,palette_count,cluster_names,cluster_hex,decade,predicted_school,confidence,color_similarity
10706,Geniuses of arts,françois boucher,1761.0,18.0,Unknown,"[3612, 7910, 3806, 4778, 7197, 5542, 6642, 496...","['pale-rose', 'light-mauve', 'brownish', 'redd...","['#ecc0be', '#be9d9b', '#9e6163', '#a07a79', '...",1760,Italy,1.00,0.0
128158,Femme et enfant,francis gruber,1939.0,20.0,Unknown,"[5609, 6553, 6408, 6379, 4597, 4588, 7784, 332...","['brownish-purple', 'brownish-purple', 'browni...","['#775453', '#775453', '#775453', '#564544', '...",1930,Italy,1.00,0.0
102247,"Chrysanthemums by a stream, with rocks",ito jakuchu,1760.0,18.0,Unknown,"[3872, 6383, 5869, 4946, 6883, 8231, 2720, 475...","['blush', 'salmon', 'dark-salmon', 'dull-red',...","['#eb9c99', '#ed7873', '#cd5853', '#ab3d3c', '...",1760,Italy,1.00,0.0
76533,Le cri,ivan tovar,1974.0,20.0,Unknown,"[4800, 4418, 1299, 2146, 5818, 7510, 7616, 408...","['light-grey', 'dull-red', 'dark-rose', 'dark-...","['#ecdddd', '#c33639', '#ae5856', '#2f2b2b', '...",1970,Italy,1.00,0.0
16401,St. Bonaventura Receiving the Banner of St. Se...,francesco solimena,1710.0,18.0,Unknown,"[3740, 4509, 5649, 7872, 4410, 4234, 3882, 499...","['warm-grey', 'brownish-purple', 'dark', 'dark...","['#938888', '#884242', '#302121', '#302121', '...",1710,Italy,1.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
163387,St. Peter Enthroned with Saints,cima da conegliano,1490.0,15.0,Unknown,"[3775, 4794, 4429, 5925, 6767, 6471, 4849, 496...","['pinkish-grey', 'greyish-pink', 'reddish-brow...","['#d4b7b9', '#c58f90', '#6b2423', '#564544', '...",1490,Flanders,0.17,0.0
142602,St. Jerome in the Wilderness,cima da conegliano,1495.0,15.0,Unknown,"[4184, 5718, 6706, 4401, 4938, 4904, 7244, 520...","['light-grey', 'pinkish-grey', 'pinkish-grey',...","['#ecdddd', '#bdb1b0', '#bdb1b0', '#ae5856', '...",1490,Flanders,0.17,0.0
242826,Madonna and Child,cima da conegliano,1497.0,15.0,Unknown,"[6147, 5574, 5199, 4591, 4096, 3827, 8436, 418...","['pinkish', 'dark-rose', 'brick', 'purple-brow...","['#d17274', '#ae5856', '#a71f22', '#642f2f', '...",1490,Flanders,0.17,0.0
111507,The Healing of Anianus,cima da conegliano,1498.0,15.0,Unknown,"[3906, 5127, 4914, 5348, 7094, 4698, 6160, 494...","['blush', 'dark-salmon', 'dark-brown', 'blush'...","['#eb9c99', '#cd5853', '#291313', '#eb9c99', '...",1490,Flanders,0.17,0.0


In [47]:
unknown_df.to_csv("unknown_school_predictions.csv", index=False)

**6. Evaluate**

In [45]:
unknown_summary = unknown_df.groupby("predicted_school").agg(
    count=("predicted_school", "count"),
    avg_confidence=("confidence", "mean"),
    avg_similarity=("color_similarity", "mean")
).sort_values(by="count", ascending=False)

In [46]:
unknown_summary

Unnamed: 0_level_0,count,avg_confidence,avg_similarity
predicted_school,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Italy,26245,0.758753,0.0
Flanders,8430,0.610391,0.0
Netherlands,3368,0.490929,0.0
Spain,2679,0.574218,0.0
Germany,2658,0.656421,0.0
England,1762,0.333598,0.0
Austria,684,0.415731,0.0
Japan,321,0.338287,0.0
Greece,314,0.590872,0.0
France,158,0.64496,0.0


**7. Features importance**

In [48]:
importances = clf.feature_importances_

In [53]:
color_feature_names = color_names
artist_feature_name = ["artist"]
decade_feature_name = ["decade"]

feature_names = color_feature_names + artist_feature_name + decade_feature_name

In [54]:
importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

In [55]:
importance_df

Unnamed: 0,feature,importance
1001,decade,0.683752
1000,artist,0.316248
672,#c1afbe,0.000000
659,#b39341,0.000000
660,#ed5714,0.000000
...,...,...
339,#dac175,0.000000
340,#9aabbc,0.000000
341,#52697d,0.000000
342,#8cb99d,0.000000


In [56]:
importance_df.to_csv("unkown_school_feature_importance.csv", index=False)