In [20]:
import sys
import os
sys.path.append(os.path.abspath("../../../../"))

import numpy as np
import pandas as pd
import utils
# Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
# Evaluation
from sklearn.metrics.pairwise import cosine_similarity
import plot_automations as plotter

In [21]:
df = pd.read_csv("../../../../01_color_clustering/non-weighted/omniart-post-color-clustering.csv")

In [22]:
# Add decade to creation intervals
df["decade"] = (df["creation_year"] // 10 * 10).astype(int)

In [23]:
color_clusters = pd.read_csv("../../../../01_color_clustering/non-weighted/color_centroids.csv")

**1. Split dataset: "Unknown" x other"**

In [24]:
unknown_df = df[df["school"].str.strip().str.lower() == "unknown"]


In [25]:
known_df = df[~(df["school"].str.strip().str.lower() == "unknown")]

**1. Features**

In [26]:
color_names = color_clusters["HEX"].tolist()
color_to_index = {name: i for i, name in enumerate(color_names)}
n_colors = len(color_names)

In [27]:
known_color_X = np.vstack(known_df.apply(lambda row: utils.artwork_to_vector(row, n_colors, color_to_index), axis=1))

In [28]:
known_X = known_color_X

In [29]:
unknown_color_X = np.vstack(unknown_df.apply(lambda row: utils.artwork_to_vector(row, n_colors, color_to_index), axis=1))

In [30]:
unknown_X = unknown_color_X

**2. Labels: split between known and unknown schools**

In [31]:
y_unknown = unknown_df["school"].copy()

In [32]:
y_known = known_df["school"].copy()

In [33]:
# Encode only known schools
label_encoder = LabelEncoder()
y_known_encoded = label_encoder.fit_transform(y_known)

**3. Train Random Forest Classifier**

In [34]:
clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
clf.fit(known_X, y_known_encoded)

**4. Predict school for "unknown" rows**

In [35]:
probas = clf.predict_proba(unknown_X)
predicted_indices = np.argmax(probas, axis=1)
predicted_schools = label_encoder.inverse_transform(predicted_indices)
confidences = np.max(probas, axis=1)

In [36]:
unknown_df = unknown_df.copy()
unknown_df.loc[:, "predicted_school"] = predicted_schools
unknown_df.loc[:, "confidence"] = confidences

In [37]:
unknown_df.sort_values(by="confidence", ascending=False, inplace=True)

In [38]:
unknown_df

Unnamed: 0,artwork_name,artist_full_name,creation_year,century,school,palette_count,cluster_hex,decade,predicted_school,confidence
138,Officers Playing Tric Trac,willem cornelisz duyster,1650.0,17.0,Unknown,"[3907, 4482, 4423, 3080, 7328, 4675, 7989, 661...","['#ae5856', '#884242', '#642f2f', '#481819', '...",1650,modern,0.847212
176789,Distribution of Herring and White Bread During...,otto van veen,1590.0,16.0,Unknown,"[4016, 5233, 4515, 5618, 4312, 5514, 6439, 596...","['#d4b7b9', '#a07a79', '#8b2c2b', '#6b2423', '...",1590,modern,0.847212
176791,The Landing of Richard II at Milford Haven,william hamilton,1800.0,19.0,Unknown,"[4246, 5623, 5923, 5343, 7971, 3539, 6672, 358...","['#91211b', '#884242', '#775453', '#5d1217', '...",1800,modern,0.847212
176792,The Gallant Charge of the Kentucky Cavalry Und...,currier and ives,1847.0,19.0,Unknown,"[5080, 5937, 8049, 8172, 3905, 3948, 3699, 485...","['#ecdddd', '#bdb1b0', '#564544', '#481819', '...",1840,modern,0.847212
176793,Nikita Pustosviat. Dispute on the Confession o...,vasily perov,1881.0,19.0,Unknown,"[3671, 5084, 6741, 4797, 5483, 5677, 4176, 581...","['#be9d9b', '#9e6163', '#734543', '#642f2f', '...",1880,modern,0.847212
...,...,...,...,...,...,...,...,...,...,...
88857,Sitting Woman,vytautas kairiukstis,1930.0,20.0,Unknown,"[2151, 4881, 9849, 5077, 4076, 2604, 2920, 734...","['#eb9c99', '#cd5853', '#be9d9b', '#c33639', '...",1930,modern,0.847212
88858,Naked Man with Rat,lucian freud,1977.0,20.0,Unknown,"[5444, 6455, 4733, 6279, 4652, 6709, 4069, 503...","['#ecc0be', '#eb9c99', '#d17274', '#ae5856', '...",1970,modern,0.847212
88859,Gabriele on the bamboo bed 2,gazmend freitag,2008.0,21.0,Unknown,"[8031, 3325, 4571, 4563, 7403, 4998, 2966, 555...","['#938888', '#938888', '#938888', '#938888', '...",2000,modern,0.847212
88860,A Vase of Flowers with a Watch,willem van aelst,1650.0,17.0,Unknown,"[1332, 11718, 11523, 3689, 5097, 3671, 3983, 4...","['#291313', '#302121', '#302121', '#bdb1b0', '...",1650,modern,0.847212


In [39]:
unknown_df.to_csv("color-unknown_school_predictions.csv", index=False)

**6. Evaluate**

In [41]:
unknown_summary = unknown_df.groupby("predicted_school").agg(
    count=("predicted_school", "count"),
    avg_confidence=("confidence", "mean"),
).sort_values(by="count", ascending=False)

In [42]:
unknown_summary

Unnamed: 0_level_0,count,avg_confidence
predicted_school,Unnamed: 1_level_1,Unnamed: 2_level_1
modern,46860,0.847212


**7. Features importance**

In [43]:
importances = clf.feature_importances_

In [44]:
color_feature_names = color_names

feature_names = color_feature_names

In [45]:
importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

In [46]:
importance_df

Unnamed: 0,feature,importance
0,#e73041,0.0
671,#75b598,0.0
658,#989678,0.0
659,#b39341,0.0
660,#ed5714,0.0
...,...,...
338,#7b9d60,0.0
339,#dac175,0.0
340,#9aabbc,0.0
341,#52697d,0.0


In [47]:
importance_df.to_csv("color-unkown-school_feature_importance.csv", index=False)