In [1]:
import sys
import os
# Append the directory four levels above the current working directory to the
# import path; presumably this is where the project-local modules imported
# below (utils, plot_automations) live — TODO confirm.
sys.path.append(os.path.abspath("../../../../"))

import numpy as np
import pandas as pd
import utils
# Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
# Evaluation
from sklearn.metrics.pairwise import cosine_similarity
import plot_automations as plotter

In [2]:
# Read the post-colour-clustering artwork CSV (path is relative to the
# notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer="../../../../01_color_clustering/weighted/weighted_omniart-post-color-clustering.csv"
)

In [3]:
# Derive each artwork's decade by flooring its creation year to a multiple
# of 10 (e.g. 1887 -> 1880) and store it as an integer column.
df["decade"] = df["creation_year"].floordiv(10).mul(10).astype(int)

In [4]:
# Read the colour-centroid table; its "HEX" column is used below as the
# ordered list of colour features.
color_clusters = pd.read_csv(
    filepath_or_buffer="../../../../01_color_clustering/weighted/weighted_color_centroids.csv"
)

**0. Split dataset: "Unknown" vs. other schools**

In [5]:
# Rows whose school is the placeholder "unknown" (case- and whitespace-insensitive).
# Take an explicit copy: this frame gets new columns and is re-sorted further
# down, and doing that on a filtered slice of df raises SettingWithCopyWarning
# and leaves it ambiguous whether df itself is affected.
is_unknown_school = df["school"].str.strip().str.lower().eq("unknown")
unknown_df = df[is_unknown_school].copy()

In [6]:
# Every row whose school is anything other than "unknown"
# (case- and whitespace-insensitive comparison).
normalized_school = df["school"].str.strip().str.lower()
known_df = df[normalized_school.ne("unknown")]

**1. Features**

In [7]:
# Ordered list of the centroid table's HEX values, how many there are, and a
# reverse lookup from each value to its position in that list.
color_names = color_clusters["HEX"].tolist()
n_colors = len(color_names)
color_to_index = dict(zip(color_names, range(n_colors)))

In [8]:
# One colour feature vector per known-school row, stacked row-wise into a
# single 2-D array.
known_color_X = np.vstack(
    known_df.apply(utils.artwork_to_vector, axis=1, args=(n_colors, color_to_index))
)

In [9]:
# One colour feature vector per unknown-school row, stacked row-wise into a
# single 2-D array (same construction as for the known rows).
unknown_color_X = np.vstack(
    unknown_df.apply(utils.artwork_to_vector, axis=1, args=(n_colors, color_to_index))
)

In [10]:
# Encoder that turns artist names into integer codes, used below as a single
# numeric feature column.
artist_encoder = LabelEncoder()

In [11]:

# Learn the artist -> integer mapping from the known-school rows, then encode
# those rows as a single feature column of shape (n_rows, 1).
known_artist_names = known_df["artist_full_name"]
artist_encoder.fit(known_artist_names)
known_artist_X = artist_encoder.transform(known_artist_names).reshape(-1, 1)

In [12]:
# Encode artists of the unknown-school rows with the SAME mapping that was
# fitted on known_df. Re-fitting here (fit_transform) would hand out unrelated
# integer codes, so the classifier would receive artist ids that mean something
# different from the ones it was trained on.
# Plain transform() would raise on artists that never occur in known_df, so the
# lookup is done explicitly and unseen artists get the sentinel code -1.
artist_code_by_name = {
    name: code for code, name in enumerate(artist_encoder.classes_)
}
unknown_artist_X = (
    unknown_df["artist_full_name"]
    .map(artist_code_by_name)
    .fillna(-1)
    .astype(int)
    .to_numpy()
    .reshape(-1, 1)
)

In [13]:
# Encoder that turns decade values into integer codes, used below as a single
# numeric feature column.
decade_encoder = LabelEncoder()

In [14]:
# Learn the decade -> integer mapping from the known-school rows, then encode
# those rows as a single feature column of shape (n_rows, 1).
known_decades = known_df["decade"]
decade_encoder.fit(known_decades)
known_decade_X = decade_encoder.transform(known_decades).reshape(-1, 1)

In [15]:
# Encode decades of the unknown-school rows with the SAME mapping that was
# fitted on known_df. Re-fitting here (fit_transform) would renumber the
# decades from 0 based only on the unknown rows, so one code could stand for
# different decades at training and prediction time.
# Decades that never occur in known_df get the sentinel code -1 instead of
# making transform() raise.
decade_code_by_value = {
    decade: code for code, decade in enumerate(decade_encoder.classes_)
}
unknown_decade_X = (
    unknown_df["decade"]
    .map(decade_code_by_value)
    .fillna(-1)
    .astype(int)
    .to_numpy()
    .reshape(-1, 1)
)

In [16]:
# Training feature matrix: colour columns, then the artist code, then the
# decade code, joined column-wise.
known_X = np.concatenate(
    (known_color_X, known_artist_X, known_decade_X),
    axis=1,
)

In [17]:
# Prediction feature matrix with the same column layout as known_X:
# colour columns, then the artist code, then the decade code.
unknown_X = np.concatenate(
    (unknown_color_X, unknown_artist_X, unknown_decade_X),
    axis=1,
)

**2. Labels: split between known and unknown schools**

In [18]:
# Independent copy of the school column for the unknown-school rows.
y_unknown = unknown_df.loc[:, "school"].copy()

In [19]:
# Independent copy of the school column for the known-school rows; these are
# the training labels.
y_known = known_df.loc[:, "school"].copy()

In [20]:
# Fit the school -> integer mapping on the known schools only, then encode
# the training labels with it.
label_encoder = LabelEncoder()
label_encoder.fit(y_known)
y_known_encoded = label_encoder.transform(y_known)

**3. Train Random Forest Classifier**

In [21]:
# Random forest hyper-parameters; the fixed random_state keeps the fit
# reproducible between runs.
forest_params = {"n_estimators": 200, "max_depth": 20, "random_state": 42}
clf = RandomForestClassifier(**forest_params)
# Train on the known-school rows; as the cell's last expression, the fitted
# estimator is also displayed.
clf.fit(known_X, y_known_encoded)

**4. Predict school for "unknown" rows**

In [None]:
# Class-probability matrix for the unknown-school rows (one column per class).
probas = clf.predict_proba(unknown_X)
# Column with the highest probability per row, and that probability itself.
predicted_indices = probas.argmax(axis=1)
confidences = probas.max(axis=1)
# The column index is used directly as the encoded label. This works because
# the classifier was trained on label_encoder's output, whose codes are the
# consecutive integers 0..n_classes-1.
predicted_schools = label_encoder.inverse_transform(predicted_indices)

In [None]:
# Attach the predicted school and its probability to the unknown-school rows.
# The original called unknown_df.copy() without keeping the result, so the
# column assignments still hit the filtered slice of df and raised
# SettingWithCopyWarning. assign() returns a new frame, which is rebound here;
# values are matched by position, which is the row order used to build unknown_X.
unknown_df = unknown_df.assign(
    predicted_school=predicted_schools,
    confidence=confidences,
)

**5. Prediction color similarity**

In [None]:
# Inputs for the colour-similarity check: colour vectors and encoded school
# labels of the known rows, plus an empty list that collects one similarity
# score per unknown row.
features_known, labels_known = known_color_X, y_known_encoded
similarities = list()

In [None]:
# Mean colour vector of the known rows for every class that was actually
# predicted. Computed once per class rather than once per unknown row.
class_means = {
    class_idx: features_known[labels_known == class_idx].mean(axis=0)
    for class_idx in np.unique(predicted_indices)
}

# Cosine similarity between each unknown row's colour vector and the mean
# colour vector of its predicted school. The list is rebuilt from scratch so
# that re-running this cell cannot append a second batch of scores and break
# the later column assignment with a length mismatch.
similarities = [
    cosine_similarity([x_u], [class_means[pred_idx]])[0, 0]
    for x_u, pred_idx in zip(unknown_color_X, predicted_indices)
]

In [None]:
# Attach the per-row colour similarity. assign() builds a new frame, which
# avoids writing into a filtered slice of df (SettingWithCopyWarning); values
# are matched by position.
unknown_df = unknown_df.assign(color_similarity=similarities)

In [None]:
# Most confident predictions first. Rebinding the sorted result replaces the
# inplace=True sort, which mutated a filtered slice of df.
# NOTE: after this cell the row order no longer matches unknown_X /
# predicted_indices, so the positional column assignments above must not be
# re-run without restarting from the split.
unknown_df = unknown_df.sort_values(by="confidence", ascending=False)

In [None]:
# Display the unknown-school rows with their predictions, sorted by confidence.
unknown_df

In [None]:
# Persist the predictions next to the notebook; the index is not written.
unknown_df.to_csv(
    path_or_buf="unknown_school_predictions.csv",
    index=False,
)

**6. Evaluate**

In [None]:
# Per predicted school: number of rows, mean classifier confidence and mean
# colour similarity, with the largest groups first.
by_predicted_school = unknown_df.groupby("predicted_school")
unknown_summary = by_predicted_school.agg(
    count=("predicted_school", "count"),
    avg_confidence=("confidence", "mean"),
    avg_similarity=("color_similarity", "mean"),
)
unknown_summary = unknown_summary.sort_values(by="count", ascending=False)

In [None]:
# Display the per-school summary table.
unknown_summary

**7. Feature importance**

In [None]:
# Importance score of each input column of the fitted forest, in the same
# column order as known_X (colour columns, artist, decade).
importances = clf.feature_importances_

In [None]:
# Human-readable names for the columns of the feature matrix, in the order
# they were stacked: one per colour, then artist, then decade.
color_feature_names = color_names
artist_feature_name = ["artist"]
decade_feature_name = ["decade"]

feature_names = [
    *color_feature_names,
    *artist_feature_name,
    *decade_feature_name,
]

In [None]:
# Pair every feature name with its importance and rank from most to least
# important.
importance_df = pd.DataFrame({"feature": feature_names, "importance": importances})
importance_df = importance_df.sort_values(by="importance", ascending=False)

In [None]:
# Display the ranked feature-importance table.
importance_df

In [None]:
# Persist the ranked feature importances; the index is not written.
# NOTE(review): "unkown" in the file name looks like a typo for "unknown". It is
# left unchanged because other scripts may read the file under this exact
# name — confirm before renaming.
importance_df.to_csv("weighted_unkown_school_feature_importance.csv", index=False)