In [1]:
import sys
import os
# Make the directory four levels above the current working directory importable
# (the relative path is resolved to an absolute one before being appended).
# NOTE(review): presumably this is where `utils` and `plot_automations` live — confirm.
sys.path.append(os.path.abspath("../../../../"))

import pandas as pd
import numpy as np
import utils
# Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Evaluation
from sklearn.metrics import classification_report
import plot_automations as plotter

In [2]:
# Load the artwork table from CSV; later cells read its creation_year,
# artist_full_name and school columns.
df = pd.read_csv("../../../../01_color_clustering/weighted/weighted_omniart-post-color-clustering.csv")

In [3]:
# Derive the decade of each artwork by flooring its creation year to a
# multiple of ten (e.g. 1887 -> 1880) and store it as an integer column.
df["decade"] = df["creation_year"].floordiv(10).mul(10).astype(int)

In [4]:
# Load the color centroid table; its HEX column supplies the color names used below.
color_clusters = pd.read_csv("../../../../01_color_clustering/weighted/weighted_color_centroids.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../../../01_color_clustering/weighted/weighted_color_centroids.csv'

**1. Features**

In [None]:
# Color vocabulary: the HEX values in file order, how many there are, and a
# lookup from each HEX value to its position in that list.
color_names = color_clusters["HEX"].tolist()
n_colors = len(color_names)
color_to_index = dict(zip(color_names, range(n_colors)))

In [None]:
def _row_to_color_vector(row):
    """Convert one artwork row into its color feature vector.

    Delegates to ``utils.artwork_to_vector``, which is defined outside this
    notebook.
    NOTE(review): presumably it returns a vector of length ``n_colors`` — confirm.
    """
    return utils.artwork_to_vector(row, n_colors, color_to_index)


# Apply the helper row by row and stack the per-artwork vectors into one matrix.
color_X = np.vstack(df.apply(_row_to_color_vector, axis=1))

In [None]:
# Map every artist name to an integer code and shape the codes as a single
# feature column of shape (n_rows, 1).
artist_encoder = LabelEncoder()
artist_codes = artist_encoder.fit_transform(df["artist_full_name"])
artist_X = artist_codes[:, np.newaxis]

In [None]:
# Map every school to an integer code and shape the codes as a single
# feature column of shape (n_rows, 1).
school_encoder = LabelEncoder()
school_codes = school_encoder.fit_transform(df["school"])
school_X = school_codes[:, np.newaxis]

In [None]:
# Feature matrix: color columns first, then the artist code, then the school code.
X = np.concatenate((color_X, artist_X, school_X), axis=1)

**2. Labels**

In [None]:
# Learn the set of decades, then encode each artwork's decade as its class index.
label_encoder = LabelEncoder().fit(df["decade"])
y = label_encoder.transform(df["decade"])

**3. Split dataset**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

**4. Train Random Forest Classifier**

In [None]:
# Fit a 200-tree random forest (tree depth capped at 20) on the training split.
# n_jobs=-1 builds the trees in parallel on all available CPU cores; because
# random_state is fixed, this speeds up training without changing the result.
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

In [None]:
# Original decade values in encoded-class order (index i holds the decade for class i).
class_labels = label_encoder.classes_.copy()

**5. Predict**

In [None]:
# Predict the encoded decade class for every row of the held-out test features.
y_pred = clf.predict(X_test)

**6. Evaluate**

In [None]:
# Sorted, de-duplicated encoded classes that actually occur in the test split;
# the report below is restricted to these.
labels_in_test = np.unique(y_test)

In [None]:
# Human-readable names (decade values as strings) for the classes present in
# the test split, in the same order as labels_in_test.
class_names_in_test = list(map(str, label_encoder.classes_[labels_in_test]))

In [None]:
# Per-class precision / recall / F1 / support as a nested dict, limited to the
# classes that occur in the test split. zero_division=0 reports 0 instead of
# warning when a metric's denominator is zero.
report_dict = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    labels=labels_in_test,
    target_names=class_names_in_test,
    output_dict=True,
    zero_division=0,
)

In [None]:
# Turn the nested dict into a table with one row per class / summary entry
# and one column per metric.
report = pd.DataFrame(report_dict).T

In [None]:
# Separate the summary rows (index contains "avg" or "accuracy") from the
# per-class rows, then order the per-class rows by descending support.
is_summary_row = report.index.str.contains("avg|accuracy")
summary_rows = report.loc[is_summary_row]
# Support is a count, so store it as an integer for the per-class rows.
class_rows = report.loc[~is_summary_row].astype({"support": int})
class_rows_sorted = class_rows.sort_values("support", ascending=False)

In [None]:
# Reassemble the report: support-sorted class rows first, summary rows at the bottom.
report = pd.concat((class_rows_sorted, summary_rows), axis=0)

In [None]:
# Show the assembled report table as this cell's output.
report

In [None]:
# Save the report next to the notebook; the row index (class and summary
# names) is written as the first CSV column.
report.to_csv("weighted_decade_classifier_report.csv")

**7. Feature importance**

In [None]:
# Importance scores from the fitted forest: one value per column of the
# feature matrix, in column order.
importances = clf.feature_importances_

In [None]:
# Column labels for the feature matrix, in the order the columns were stacked:
# one per color, followed by the artist code and the school code.
color_feature_names = color_names
artist_feature_name = ["artist"]
school_feature_name = ["school"]

feature_names = [
    *color_feature_names,
    *artist_feature_name,
    *school_feature_name,
]

In [None]:
# Pair each feature name with its importance, then rank from most to least important.
importance_df = pd.DataFrame({"feature": feature_names, "importance": importances})
importance_df = importance_df.sort_values("importance", ascending=False)

In [None]:
# Show the ranked feature-importance table as this cell's output.
importance_df

In [None]:
# Save the ranked importances next to the notebook; index=False leaves the
# row index out of the file.
importance_df.to_csv("weighted_decade_feature_importance.csv", index=False)